; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved stores.
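; As a rough sketch (not part of the checked output): a stride-8 interleaved
; store corresponds to a scalar loop of the following shape, where the names
; in, out and n are hypothetical and only illustrate the access pattern:
;
;   // C sketch of the interleaving these tests exercise.
;   for (int i = 0; i < n; ++i)
;     for (int j = 0; j < 8; ++j)
;       out[i*8 + j] = in[j][i];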

define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind {
; SSE-LABEL: store_i64_stride8_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps (%rsi), %xmm1
; SSE-NEXT: movaps (%rdx), %xmm2
; SSE-NEXT: movaps (%rcx), %xmm3
; SSE-NEXT: movaps (%r8), %xmm4
; SSE-NEXT: movaps (%r9), %xmm5
; SSE-NEXT: movaps (%r11), %xmm6
; SSE-NEXT: movaps (%r10), %xmm7
; SSE-NEXT: movaps %xmm0, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0]
; SSE-NEXT: movaps %xmm2, %xmm9
; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm3[0]
; SSE-NEXT: movaps %xmm6, %xmm10
; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm7[0]
; SSE-NEXT: movaps %xmm4, %xmm11
; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm5[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1]
; SSE-NEXT: movaps %xmm4, 96(%rax)
; SSE-NEXT: movaps %xmm6, 112(%rax)
; SSE-NEXT: movaps %xmm2, 80(%rax)
; SSE-NEXT: movaps %xmm0, 64(%rax)
; SSE-NEXT: movaps %xmm11, 32(%rax)
; SSE-NEXT: movaps %xmm10, 48(%rax)
; SSE-NEXT: movaps %xmm9, 16(%rax)
; SSE-NEXT: movaps %xmm8, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i64_stride8_vf2:
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vmovaps (%rsi), %xmm1
; AVX-NEXT: vmovaps (%r8), %xmm2
; AVX-NEXT: vmovaps (%r9), %xmm3
; AVX-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0
; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX-NEXT: vinsertf128 $1, (%r11), %ymm3, %ymm3
; AVX-NEXT: vinsertf128 $1, (%r10), %ymm2, %ymm2
; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX-NEXT: vmovaps %ymm1, 96(%rax)
; AVX-NEXT: vmovaps %ymm0, 64(%rax)
; AVX-NEXT: vmovaps %ymm5, 32(%rax)
; AVX-NEXT: vmovaps %ymm4, (%rax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i64_stride8_vf2:
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: vmovaps (%rdi), %xmm0
; AVX2-NEXT: vmovaps (%rdx), %xmm1
; AVX2-NEXT: vmovaps (%r8), %xmm2
; AVX2-NEXT: vmovaps (%r11), %xmm3
; AVX2-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
; AVX2-NEXT: vinsertf128 $1, (%r9), %ymm2, %ymm2
; AVX2-NEXT: vinsertf128 $1, (%r10), %ymm3, %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vmovaps %ymm1, 96(%rax)
; AVX2-NEXT: vmovaps %ymm0, 64(%rax)
; AVX2-NEXT: vmovaps %ymm5, 32(%rax)
; AVX2-NEXT: vmovaps %ymm4, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i64_stride8_vf2:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FP-NEXT: vmovaps (%rdx), %xmm1
; AVX2-FP-NEXT: vmovaps (%r8), %xmm2
; AVX2-FP-NEXT: vmovaps (%r11), %xmm3
; AVX2-FP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
; AVX2-FP-NEXT: vinsertf128 $1, (%r9), %ymm2, %ymm2
; AVX2-FP-NEXT: vinsertf128 $1, (%r10), %ymm3, %ymm3
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax)
; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax)
; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rax)
; AVX2-FP-NEXT: vmovaps %ymm4, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i64_stride8_vf2:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm1
; AVX2-FCP-NEXT: vmovaps (%r8), %xmm2
; AVX2-FCP-NEXT: vmovaps (%r11), %xmm3
; AVX2-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
; AVX2-FCP-NEXT: vinsertf128 $1, (%r9), %ymm2, %ymm2
; AVX2-FCP-NEXT: vinsertf128 $1, (%r10), %ymm3, %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm4, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i64_stride8_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-NEXT: vmovdqa (%r8), %xmm2
; AVX512-NEXT: vmovdqa (%r11), %xmm3
; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1
; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15]
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax)
; AVX512-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i64_stride8_vf2:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3
; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1
; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15]
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i64_stride8_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3
; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1
; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15]
; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i64_stride8_vf2:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i64_stride8_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
; AVX512BW-NEXT: vmovdqa (%r11), %xmm3
; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1
; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15]
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i64_stride8_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15]
; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i64_stride8_vf2:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15]
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i64_stride8_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64
  %in.vec1 = load <2 x i64>, ptr %in.vecptr1, align 64
  %in.vec2 = load <2 x i64>, ptr %in.vecptr2, align 64
  %in.vec3 = load <2 x i64>, ptr %in.vecptr3, align 64
  %in.vec4 = load <2 x i64>, ptr %in.vecptr4, align 64
  %in.vec5 = load <2 x i64>, ptr %in.vecptr5, align 64
  %in.vec6 = load <2 x i64>, ptr %in.vecptr6, align 64
  %in.vec7 = load <2 x i64>, ptr %in.vecptr7, align 64
  %1 = shufflevector <2 x i64> %in.vec0, <2 x i64> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <2 x i64> %in.vec2, <2 x i64> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = shufflevector <2 x i64> %in.vec4, <2 x i64> %in.vec5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <2 x i64> %in.vec6, <2 x i64> %in.vec7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = shufflevector <4 x i64> %1, <4 x i64> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %6 = shufflevector <4 x i64> %3, <4 x i64> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %7 = shufflevector <8 x i64> %5, <8 x i64> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %interleaved.vec = shufflevector <16 x i64> %7, <16 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <16 x i64> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
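; For vf2, %7 above concatenates the eight <2 x i64> inputs as
; [a0,a1,b0,b1,...,h0,h1], so the final <0,2,4,...,14,1,3,...,15> mask stores
; element 0 of every input followed by element 1 of every input, i.e.
; out[j] = in_j[0] and out[8+j] = in_j[1] for j = 0..7 (a..h are shorthand
; for the eight inputs, not names from the test).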

define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind {
; SSE-LABEL: store_i64_stride8_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movaps (%rdi), %xmm4
; SSE-NEXT: movaps 16(%rdi), %xmm2
; SSE-NEXT: movaps (%rsi), %xmm10
; SSE-NEXT: movaps 16(%rsi), %xmm14
; SSE-NEXT: movaps (%rdx), %xmm1
; SSE-NEXT: movaps 16(%rdx), %xmm3
; SSE-NEXT: movaps (%rcx), %xmm7
; SSE-NEXT: movaps 16(%rcx), %xmm12
; SSE-NEXT: movaps (%r8), %xmm5
; SSE-NEXT: movaps 16(%r8), %xmm0
; SSE-NEXT: movaps (%r9), %xmm13
; SSE-NEXT: movaps (%r10), %xmm6
; SSE-NEXT: movaps 16(%r10), %xmm9
; SSE-NEXT: movaps (%rax), %xmm15
; SSE-NEXT: movaps 16(%rax), %xmm11
; SSE-NEXT: movaps %xmm1, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1]
; SSE-NEXT: movaps %xmm4, %xmm7
; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1]
; SSE-NEXT: movaps %xmm3, %xmm10
; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm12[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1]
; SSE-NEXT: movaps %xmm2, %xmm12
; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm14[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1]
; SSE-NEXT: movaps %xmm6, %xmm14
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1]
; SSE-NEXT: movaps %xmm5, %xmm15
; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1]
; SSE-NEXT: movaps %xmm9, %xmm13
; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1]
; SSE-NEXT: movaps 16(%r9), %xmm11
; SSE-NEXT: movaps %xmm0, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps %xmm0, 224(%rax)
; SSE-NEXT: movaps %xmm9, 240(%rax)
; SSE-NEXT: movaps %xmm8, 160(%rax)
; SSE-NEXT: movaps %xmm13, 176(%rax)
; SSE-NEXT: movaps %xmm5, 96(%rax)
; SSE-NEXT: movaps %xmm6, 112(%rax)
; SSE-NEXT: movaps %xmm15, 32(%rax)
; SSE-NEXT: movaps %xmm14, 48(%rax)
; SSE-NEXT: movaps %xmm2, 192(%rax)
; SSE-NEXT: movaps %xmm3, 208(%rax)
; SSE-NEXT: movaps %xmm12, 128(%rax)
; SSE-NEXT: movaps %xmm10, 144(%rax)
; SSE-NEXT: movaps %xmm4, 64(%rax)
; SSE-NEXT: movaps %xmm1, 80(%rax)
; SSE-NEXT: movaps %xmm7, (%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i64_stride8_vf4:
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX-NEXT: vmovaps (%rdx), %ymm2
; AVX-NEXT: vmovaps (%rcx), %ymm3
; AVX-NEXT: vmovaps (%r11), %ymm1
; AVX-NEXT: vmovaps (%r10), %ymm4
; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
; AVX-NEXT: vmovaps 16(%r9), %xmm5
; AVX-NEXT: vmovaps 16(%r8), %xmm6
; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm5[1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX-NEXT: vmovaps 16(%rsi), %xmm5
; AVX-NEXT: vmovaps 16(%rdi), %xmm6
; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm5[1]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm6[0],xmm5[0]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vmovaps (%r9), %xmm3
; AVX-NEXT: vmovaps (%r8), %xmm5
; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm5[1],xmm3[1]
; AVX-NEXT: vmovaps (%r10), %xmm7
; AVX-NEXT: vmovaps (%r11), %xmm8
; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1]
; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm5[0],xmm3[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm8[0],xmm7[0]
; AVX-NEXT: vmovaps (%rsi), %xmm7
; AVX-NEXT: vmovaps (%rdi), %xmm8
; AVX-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm8[1],xmm7[1]
; AVX-NEXT: vmovaps (%rcx), %xmm11
; AVX-NEXT: vmovaps (%rdx), %xmm12
; AVX-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm11[1]
; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm12[0],xmm11[0]
; AVX-NEXT: vmovaps %xmm8, 16(%rax)
; AVX-NEXT: vmovaps %xmm7, (%rax)
; AVX-NEXT: vmovaps %xmm13, 80(%rax)
; AVX-NEXT: vmovaps %xmm10, 64(%rax)
; AVX-NEXT: vmovaps %xmm5, 48(%rax)
; AVX-NEXT: vmovaps %xmm3, 32(%rax)
; AVX-NEXT: vmovaps %xmm9, 112(%rax)
; AVX-NEXT: vmovaps %xmm6, 96(%rax)
; AVX-NEXT: vmovaps %ymm2, 128(%rax)
; AVX-NEXT: vmovaps %ymm4, 192(%rax)
; AVX-NEXT: vmovaps %ymm1, 224(%rax)
; AVX-NEXT: vmovaps %ymm0, 160(%rax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i64_stride8_vf4:
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps (%rsi), %ymm1
; AVX2-NEXT: vmovaps (%rdx), %ymm2
; AVX2-NEXT: vmovaps (%rcx), %ymm3
; AVX2-NEXT: vmovaps (%r8), %ymm4
; AVX2-NEXT: vmovaps (%r9), %ymm5
; AVX2-NEXT: vmovaps (%r11), %ymm6
; AVX2-NEXT: vmovaps (%r10), %ymm7
; AVX2-NEXT: vmovaps (%r9), %xmm8
; AVX2-NEXT: vinsertf128 $1, (%r10), %ymm8, %ymm8
; AVX2-NEXT: vmovaps (%r8), %xmm9
; AVX2-NEXT: vinsertf128 $1, (%r11), %ymm9, %ymm9
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
; AVX2-NEXT: vmovaps (%rsi), %xmm9
; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm9, %ymm9
; AVX2-NEXT: vmovaps (%rdi), %xmm11
; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm11, %ymm11
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[2],ymm7[2]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vmovaps %ymm0, 128(%rax)
; AVX2-NEXT: vmovaps %ymm5, 192(%rax)
; AVX2-NEXT: vmovaps %ymm4, 224(%rax)
; AVX2-NEXT: vmovaps %ymm11, 160(%rax)
; AVX2-NEXT: vmovaps %ymm9, 64(%rax)
; AVX2-NEXT: vmovaps %ymm12, (%rax)
; AVX2-NEXT: vmovaps %ymm8, 96(%rax)
; AVX2-NEXT: vmovaps %ymm10, 32(%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i64_stride8_vf4:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1
; AVX2-FP-NEXT: vmovaps (%rdx), %ymm2
; AVX2-FP-NEXT: vmovaps (%rcx), %ymm3
; AVX2-FP-NEXT: vmovaps (%r8), %ymm4
; AVX2-FP-NEXT: vmovaps (%r9), %ymm5
; AVX2-FP-NEXT: vmovaps (%r11), %ymm6
; AVX2-FP-NEXT: vmovaps (%r10), %ymm7
; AVX2-FP-NEXT: vmovaps (%r9), %xmm8
; AVX2-FP-NEXT: vinsertf128 $1, (%r10), %ymm8, %ymm8
; AVX2-FP-NEXT: vmovaps (%r8), %xmm9
; AVX2-FP-NEXT: vinsertf128 $1, (%r11), %ymm9, %ymm9
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
; AVX2-FP-NEXT: vmovaps (%rsi), %xmm9
; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm9, %ymm9
; AVX2-FP-NEXT: vmovaps (%rdi), %xmm11
; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm11, %ymm11
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[2],ymm7[2]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax)
; AVX2-FP-NEXT: vmovaps %ymm5, 192(%rax)
; AVX2-FP-NEXT: vmovaps %ymm4, 224(%rax)
; AVX2-FP-NEXT: vmovaps %ymm11, 160(%rax)
; AVX2-FP-NEXT: vmovaps %ymm9, 64(%rax)
; AVX2-FP-NEXT: vmovaps %ymm12, (%rax)
; AVX2-FP-NEXT: vmovaps %ymm8, 96(%rax)
; AVX2-FP-NEXT: vmovaps %ymm10, 32(%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i64_stride8_vf4:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1
; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2
; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm3
; AVX2-FCP-NEXT: vmovaps (%r8), %ymm4
; AVX2-FCP-NEXT: vmovaps (%r9), %ymm5
; AVX2-FCP-NEXT: vmovaps (%r11), %ymm6
; AVX2-FCP-NEXT: vmovaps (%r10), %ymm7
; AVX2-FCP-NEXT: vmovaps (%r9), %xmm8
; AVX2-FCP-NEXT: vinsertf128 $1, (%r10), %ymm8, %ymm8
; AVX2-FCP-NEXT: vmovaps (%r8), %xmm9
; AVX2-FCP-NEXT: vinsertf128 $1, (%r11), %ymm9, %ymm9
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm9
; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm9, %ymm9
; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm11
; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm11, %ymm11
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[2],ymm7[2]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm5, 192(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm4, 224(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm11, 160(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm9, 64(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm12, (%rax)
; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm10, 32(%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i64_stride8_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vmovdqa (%rdx), %ymm1
; AVX512-NEXT: vmovdqa (%r8), %ymm2
; AVX512-NEXT: vmovdqa (%r11), %ymm3
; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
; AVX512-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12]
; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5
; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13]
; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6
; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm6
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14]
; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7
; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15]
; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm2
; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm0
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax)
; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax)
; AVX512-NEXT: vmovdqa64 %zmm4, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i64_stride8_vf4:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1
; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2
; AVX512-FCP-NEXT: vmovdqa (%r11), %ymm3
; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
; AVX512-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12]
; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4
; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13]
; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14]
; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15]
; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0
; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i64_stride8_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1
; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2
; AVX512DQ-NEXT: vmovdqa (%r11), %ymm3
; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
; AVX512DQ-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12]
; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm4
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13]
; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm6
; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14]
; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15]
; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm2
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm0
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i64_stride8_vf4:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %ymm3
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12]
; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4
; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13]
; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6
; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14]
; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15]
; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0
; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i64_stride8_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1
; AVX512BW-NEXT: vmovdqa (%r8), %ymm2
; AVX512BW-NEXT: vmovdqa (%r11), %ymm3
; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
; AVX512BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12]
; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13]
; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14]
; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15]
; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm2
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i64_stride8_vf4:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1
; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2
; AVX512BW-FCP-NEXT: vmovdqa (%r11), %ymm3
; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12]
; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13]
; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6
; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14]
; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15]
; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2
; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i64_stride8_vf4:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1
; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2
; AVX512DQ-BW-NEXT: vmovdqa (%r11), %ymm3
; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12]
; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13]
; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14]
; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15]
; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i64_stride8_vf4:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %ymm3
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12]
; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13]
; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14]
; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15]
; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <4 x i64>, ptr %in.vecptr0, align 64
  %in.vec1 = load <4 x i64>, ptr %in.vecptr1, align 64
  %in.vec2 = load <4 x i64>, ptr %in.vecptr2, align 64
  %in.vec3 = load <4 x i64>, ptr %in.vecptr3, align 64
  %in.vec4 = load <4 x i64>, ptr %in.vecptr4, align 64
  %in.vec5 = load <4 x i64>, ptr %in.vecptr5, align 64
  %in.vec6 = load <4 x i64>, ptr %in.vecptr6, align 64
  %in.vec7 = load <4 x i64>, ptr %in.vecptr7, align 64
  %1 = shufflevector <4 x i64> %in.vec0, <4 x i64> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i64> %in.vec2, <4 x i64> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = shufflevector <4 x i64> %in.vec4, <4 x i64> %in.vec5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <4 x i64> %in.vec6, <4 x i64> %in.vec7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = shufflevector <8 x i64> %1, <8 x i64> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %6 = shufflevector <8 x i64> %3, <8 x i64> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %7 = shufflevector <16 x i64> %5, <16 x i64> %6, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %interleaved.vec = shufflevector <32 x i64> %7, <32 x i64> poison, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <32 x i64> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
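; vf4 is the same scheme widened to <4 x i64> inputs: %7 above holds
; [a0..a3,b0..b3,...,h0..h3], and the final mask realizes
; out[i*8 + j] = in_j[i] for i = 0..3, j = 0..7 (again with a..h as
; shorthand for the eight inputs).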

define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind {
; SSE-LABEL: store_i64_stride8_vf8:
; SSE: # %bb.0:
; SSE-NEXT: subq $152, %rsp
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movaps (%rdi), %xmm7
; SSE-NEXT: movaps 16(%rdi), %xmm9
; SSE-NEXT: movaps (%rsi), %xmm3
; SSE-NEXT: movaps 16(%rsi), %xmm0
; SSE-NEXT: movaps (%rdx), %xmm8
; SSE-NEXT: movaps 16(%rdx), %xmm11
; SSE-NEXT: movaps (%rcx), %xmm4
; SSE-NEXT: movaps 16(%rcx), %xmm1
; SSE-NEXT: movaps (%r8), %xmm10
; SSE-NEXT: movaps 16(%r8), %xmm13
; SSE-NEXT: movaps (%r9), %xmm5
; SSE-NEXT: movaps 16(%r9), %xmm2
; SSE-NEXT: movaps (%r10), %xmm12
; SSE-NEXT: movaps 16(%r10), %xmm15
; SSE-NEXT: movaps (%rax), %xmm6
; SSE-NEXT: movaps %xmm7, %xmm14
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm3[0]
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm8, %xmm3
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm10, %xmm4
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1]
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm12, %xmm4
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1]
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm9, %xmm3
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1]
; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm11, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1]
; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm13, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1]
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 16(%rax), %xmm0
; SSE-NEXT: movaps %xmm15, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 32(%rdi), %xmm13
; SSE-NEXT: movaps 32(%rsi), %xmm0
; SSE-NEXT: movaps %xmm13, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
; SSE-NEXT: movaps 32(%rdx), %xmm11
; SSE-NEXT: movaps 32(%rcx), %xmm0
; SSE-NEXT: movaps %xmm11, %xmm15
; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
; SSE-NEXT: movaps 32(%r8), %xmm10
; SSE-NEXT: movaps 32(%r9), %xmm0
; SSE-NEXT: movaps %xmm10, %xmm14
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
; SSE-NEXT: movaps 32(%r10), %xmm8
; SSE-NEXT: movaps 32(%rax), %xmm1
; SSE-NEXT: movaps %xmm8, %xmm12
; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1]
; SSE-NEXT: movaps 48(%rdi), %xmm6
; SSE-NEXT: movaps 48(%rsi), %xmm0
; SSE-NEXT: movaps %xmm6, %xmm9
; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
; SSE-NEXT: movaps 48(%rdx), %xmm5
; SSE-NEXT: movaps 48(%rcx), %xmm1
; SSE-NEXT: movaps %xmm5, %xmm7
; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE-NEXT: movaps 48(%r8), %xmm1
; SSE-NEXT: movaps 48(%r9), %xmm2
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-NEXT: movaps 48(%r10), %xmm2
; SSE-NEXT: movaps 48(%rax), %xmm3
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps %xmm2, 496(%rax)
; SSE-NEXT: movaps %xmm1, 480(%rax)
; SSE-NEXT: movaps %xmm5, 464(%rax)
; SSE-NEXT: movaps %xmm6, 448(%rax)
; SSE-NEXT: movaps %xmm0, 432(%rax)
; SSE-NEXT: movaps %xmm4, 416(%rax)
; SSE-NEXT: movaps %xmm7, 400(%rax)
; SSE-NEXT: movaps %xmm9, 384(%rax)
; SSE-NEXT: movaps %xmm8, 368(%rax)
; SSE-NEXT: movaps %xmm10, 352(%rax)
; SSE-NEXT: movaps %xmm11, 336(%rax)
; SSE-NEXT: movaps %xmm13, 320(%rax)
; SSE-NEXT: movaps %xmm12, 304(%rax)
; SSE-NEXT: movaps %xmm14, 288(%rax)
; SSE-NEXT: movaps %xmm15, 272(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 256(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 240(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 224(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 208(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 192(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 176(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 160(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 144(%rax)
; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 128(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 112(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 96(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 64(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 48(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, (%rax)
; SSE-NEXT: addq $152, %rsp
; SSE-NEXT: retq
;
; AVX-LABEL: store_i64_stride8_vf8:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps 32(%r9), %xmm1
; AVX-NEXT: vmovaps 32(%r8), %xmm2
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm1[0]
; AVX-NEXT: vmovaps (%rax), %xmm5
; AVX-NEXT: vmovaps 32(%rax), %xmm3
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4
; AVX-NEXT: vinsertf128 $1, 32(%r10), %ymm0, %ymm0
; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[2]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm2
; AVX-NEXT: vbroadcastsd 40(%r10), %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 32(%rsi), %xmm3
; AVX-NEXT: vmovaps 32(%rdi), %xmm4
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm3[0]
; AVX-NEXT: vmovaps (%rcx), %xmm7
; AVX-NEXT: vmovaps 32(%rcx), %xmm6
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm8
; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2
; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm8[1],ymm2[2],ymm8[2]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm4
; AVX-NEXT: vbroadcastsd 40(%rdx), %ymm6
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps (%r9), %xmm6
; AVX-NEXT: vmovaps (%r8), %xmm8
; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm6[0]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm9
; AVX-NEXT: vinsertf128 $1, (%r10), %ymm4, %ymm4
; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2],ymm9[2]
; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm6[1]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
; AVX-NEXT: vbroadcastsd 8(%r10), %ymm8
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
; AVX-NEXT: vmovaps (%rsi), %xmm8
; AVX-NEXT: vmovaps (%rdi), %xmm9
; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm9[0],xmm8[0]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm10
; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm6, %ymm6
; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2],ymm10[2]
; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7
; AVX-NEXT: vbroadcastsd 8(%rdx), %ymm9
; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
; AVX-NEXT: vmovaps 16(%r9), %xmm10
; AVX-NEXT: vmovaps 16(%r8), %xmm11
; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm11[0],xmm10[0]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
; AVX-NEXT: vbroadcastsd 16(%rax), %ymm9
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
; AVX-NEXT: vmovaps 16(%rsi), %xmm12
; AVX-NEXT: vmovaps 16(%rdi), %xmm13
; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0]
; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],mem[4,5,6,7]
; AVX-NEXT: vbroadcastsd 16(%rcx), %ymm14
; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7]
; AVX-NEXT: vmovaps 48(%r9), %xmm14
; AVX-NEXT: vmovaps 48(%r8), %xmm15
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1]
; AVX-NEXT: vbroadcastsd 56(%r10), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm14[0]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
; AVX-NEXT: vbroadcastsd 48(%rax), %ymm14
; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm14[6,7]
; AVX-NEXT: vmovaps 48(%rsi), %xmm1
; AVX-NEXT: vmovaps 48(%rdi), %xmm15
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm1[1]
; AVX-NEXT: vbroadcastsd 56(%rdx), %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm1[0]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
; AVX-NEXT: vbroadcastsd 48(%rcx), %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],xmm10[1]
; AVX-NEXT: vbroadcastsd 24(%r10), %ymm10
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
; AVX-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm13[1],xmm12[1]
; AVX-NEXT: vbroadcastsd 24(%rdx), %ymm11
; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7]
; AVX-NEXT: vmovaps %ymm10, 192(%rdx)
; AVX-NEXT: vmovaps %ymm3, 224(%rdx)
;
AVX-NEXT: vmovaps %ymm8, 64(%rdx) 1259; AVX-NEXT: vmovapd %ymm6, (%rdx) 1260; AVX-NEXT: vmovaps %ymm1, 384(%rdx) 1261; AVX-NEXT: vmovaps %ymm0, 448(%rdx) 1262; AVX-NEXT: vmovaps %ymm5, 96(%rdx) 1263; AVX-NEXT: vmovapd %ymm4, 32(%rdx) 1264; AVX-NEXT: vmovaps %ymm14, 416(%rdx) 1265; AVX-NEXT: vmovaps %ymm2, 480(%rdx) 1266; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1267; AVX-NEXT: vmovaps %ymm0, 320(%rdx) 1268; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1269; AVX-NEXT: vmovaps %ymm0, 256(%rdx) 1270; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1271; AVX-NEXT: vmovaps %ymm0, 352(%rdx) 1272; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1273; AVX-NEXT: vmovaps %ymm0, 288(%rdx) 1274; AVX-NEXT: vmovaps %ymm9, 128(%rdx) 1275; AVX-NEXT: vmovaps %ymm7, 160(%rdx) 1276; AVX-NEXT: popq %rax 1277; AVX-NEXT: vzeroupper 1278; AVX-NEXT: retq 1279; 1280; AVX2-LABEL: store_i64_stride8_vf8: 1281; AVX2: # %bb.0: 1282; AVX2-NEXT: pushq %rax 1283; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 1284; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 1285; AVX2-NEXT: vmovaps (%rax), %xmm6 1286; AVX2-NEXT: vmovaps 32(%rax), %xmm3 1287; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 1288; AVX2-NEXT: vmovaps (%r9), %xmm9 1289; AVX2-NEXT: vmovaps 32(%r9), %xmm7 1290; AVX2-NEXT: vmovaps (%r8), %xmm10 1291; AVX2-NEXT: vmovaps 32(%r8), %xmm8 1292; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1] 1293; AVX2-NEXT: vbroadcastsd 40(%r10), %ymm2 1294; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 1295; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 1296; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1297; AVX2-NEXT: vmovaps 32(%rcx), %xmm5 1298; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 1299; AVX2-NEXT: vmovaps (%rsi), %xmm11 1300; AVX2-NEXT: vmovaps 32(%rsi), %xmm13 1301; AVX2-NEXT: vmovaps (%rdi), %xmm12 1302; AVX2-NEXT: vmovaps 32(%rdi), %xmm14 1303; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm13[1] 1304; AVX2-NEXT: vbroadcastsd 40(%rdx), %ymm4 1305; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 1306; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm1[6,7] 1307; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1308; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 1309; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm9[1] 1310; AVX2-NEXT: vbroadcastsd 8(%r10), %ymm15 1311; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] 1312; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] 1313; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1314; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] 1315; AVX2-NEXT: vbroadcastsd 8(%rdx), %ymm15 1316; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] 1317; AVX2-NEXT: vmovaps (%rcx), %xmm15 1318; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 1319; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7] 1320; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm7[0] 1321; AVX2-NEXT: vmovaps (%rdi), %ymm7 1322; AVX2-NEXT: vinsertf128 $1, 32(%r10), %ymm0, %ymm0 1323; AVX2-NEXT: vbroadcastsd %xmm3, %ymm3 1324; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] 1325; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1326; AVX2-NEXT: vmovaps (%rsi), %ymm8 1327; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] 1328; AVX2-NEXT: vmovaps 32(%r8), %ymm1 1329; AVX2-NEXT: vinsertf128 $1, 32(%rdx), %ymm0, %ymm0 
; AVX2-NEXT: vbroadcastsd %xmm5, %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovaps (%r8), %ymm13
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm9[0]
; AVX2-NEXT: vmovaps (%r9), %ymm14
; AVX2-NEXT: vinsertf128 $1, (%r10), %ymm0, %ymm0
; AVX2-NEXT: vbroadcastsd %xmm6, %ymm6
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0]
; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vbroadcastsd %xmm15, %ymm9
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 16(%rax), %ymm10
; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm10[6,7]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 16(%rcx), %ymm11
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7]
; AVX2-NEXT: vmovaps 32(%r9), %ymm0
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX2-NEXT: vbroadcastsd 56(%r10), %ymm15
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 48(%rax), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vmovaps 32(%rsi), %ymm15
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm15[1],ymm1[3],ymm15[3]
; AVX2-NEXT: vbroadcastsd 56(%rdx), %ymm3
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm14[1],ymm13[3],ymm14[3]
; AVX2-NEXT: vbroadcastsd 24(%r10), %ymm13
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
; AVX2-NEXT: vbroadcastsd 24(%rdx), %ymm8
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3]
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovaps %ymm7, 192(%rdx)
; AVX2-NEXT: vmovaps %ymm3, 224(%rdx)
; AVX2-NEXT: vmovaps %ymm1, 384(%rdx)
; AVX2-NEXT: vmovaps %ymm0, 448(%rdx)
; AVX2-NEXT: vmovaps %ymm2, 416(%rdx)
; AVX2-NEXT: vmovaps %ymm8, 480(%rdx)
; AVX2-NEXT: vmovaps %ymm11, 128(%rdx)
; AVX2-NEXT: vmovaps %ymm10, 160(%rdx)
; AVX2-NEXT: vmovaps %ymm4, 64(%rdx)
; AVX2-NEXT: vmovaps %ymm9, (%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 96(%rdx)
; AVX2-NEXT: vmovaps %ymm6, 32(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 320(%rdx)
; AVX2-NEXT: vmovaps %ymm5, 256(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 352(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 288(%rdx)
; AVX2-NEXT: popq %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i64_stride8_vf8:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: pushq %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovaps (%rax), %xmm6
; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm3
; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovaps (%r9), %xmm9
; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm7
; AVX2-FP-NEXT: vmovaps (%r8), %xmm10
; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm8
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1]
; AVX2-FP-NEXT: vbroadcastsd 40(%r10), %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm5
; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1
; AVX2-FP-NEXT: vmovaps (%rsi), %xmm11
; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm13
; AVX2-FP-NEXT: vmovaps (%rdi), %xmm12
; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm14
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm13[1]
; AVX2-FP-NEXT: vbroadcastsd 40(%rdx), %ymm4
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm9[1]
; AVX2-FP-NEXT: vbroadcastsd 8(%r10), %ymm15
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1]
; AVX2-FP-NEXT: vbroadcastsd 8(%rdx), %ymm15
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vmovaps (%rcx), %xmm15
; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm7[0]
; AVX2-FP-NEXT: vmovaps (%rdi), %ymm7
; AVX2-FP-NEXT: vinsertf128 $1, 32(%r10), %ymm0, %ymm0
; AVX2-FP-NEXT: vbroadcastsd %xmm3, %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps (%rsi), %ymm8
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0]
; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm1
; AVX2-FP-NEXT: vinsertf128 $1, 32(%rdx), %ymm0, %ymm0
; AVX2-FP-NEXT: vbroadcastsd %xmm5, %ymm5
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FP-NEXT: vmovaps (%r8), %ymm13
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm9[0]
; AVX2-FP-NEXT: vmovaps (%r9), %ymm14
; AVX2-FP-NEXT: vinsertf128 $1, (%r10), %ymm0, %ymm0
; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm6
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0]
; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0
; AVX2-FP-NEXT: vbroadcastsd %xmm15, %ymm9
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 16(%rax), %ymm10
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 16(%rcx), %ymm11
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7]
; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm0
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX2-FP-NEXT: vbroadcastsd 56(%r10), %ymm15
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 48(%rax), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm15
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm15[1],ymm1[3],ymm15[3]
; AVX2-FP-NEXT: vbroadcastsd 56(%rdx), %ymm3
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm14[1],ymm13[3],ymm14[3]
; AVX2-FP-NEXT: vbroadcastsd 24(%r10), %ymm13
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
; AVX2-FP-NEXT: vbroadcastsd 24(%rdx), %ymm8
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3]
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovaps %ymm7, 192(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm3, 224(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm2, 416(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm8, 480(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm11, 128(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm10, 160(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm9, (%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm5, 256(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rdx)
; AVX2-FP-NEXT: popq %rax
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i64_stride8_vf8:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: pushq %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovaps (%rax), %xmm6
; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm3
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX2-FCP-NEXT: vmovaps (%r9), %xmm9
; AVX2-FCP-NEXT: vmovaps 32(%r9), %xmm7
; AVX2-FCP-NEXT: vmovaps (%r8), %xmm10
; AVX2-FCP-NEXT: vmovaps 32(%r8), %xmm8
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1]
; AVX2-FCP-NEXT: vbroadcastsd 40(%r10), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm5
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1
; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm11
; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm13
; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm12
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm14
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm13[1]
; AVX2-FCP-NEXT: vbroadcastsd 40(%rdx), %ymm4
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm9[1]
; AVX2-FCP-NEXT: vbroadcastsd 8(%r10), %ymm15
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1]
; AVX2-FCP-NEXT: vbroadcastsd 8(%rdx), %ymm15
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm15
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm7[0]
; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm7
; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r10), %ymm0, %ymm0
; AVX2-FCP-NEXT: vbroadcastsd %xmm3, %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm8
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0]
; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm1
; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rdx), %ymm0, %ymm0
; AVX2-FCP-NEXT: vbroadcastsd %xmm5, %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovaps (%r8), %ymm13
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm9[0]
; AVX2-FCP-NEXT: vmovaps (%r9), %ymm14
; AVX2-FCP-NEXT: vinsertf128 $1, (%r10), %ymm0, %ymm0
; AVX2-FCP-NEXT: vbroadcastsd %xmm6, %ymm6
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0]
; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0
; AVX2-FCP-NEXT: vbroadcastsd %xmm15, %ymm9
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 16(%rax), %ymm10
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 16(%rcx), %ymm11
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7]
; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm0
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX2-FCP-NEXT: vbroadcastsd 56(%r10), %ymm15
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 48(%rax), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm15
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm15[1],ymm1[3],ymm15[3]
; AVX2-FCP-NEXT: vbroadcastsd 56(%rdx), %ymm3
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm14[1],ymm13[3],ymm14[3]
; AVX2-FCP-NEXT: vbroadcastsd 24(%r10), %ymm13
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
; AVX2-FCP-NEXT: vbroadcastsd 24(%rdx), %ymm8
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovaps %ymm7, 192(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm3, 224(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm2, 416(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm8, 480(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm11, 128(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm10, 160(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm9, (%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm5, 256(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rdx)
; AVX2-FCP-NEXT: popq %rax
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i64_stride8_vf8:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm6
; AVX512-NEXT: vmovdqa64 (%rsi), %zmm8
; AVX512-NEXT: vmovdqa64 (%rdx), %zmm7
; AVX512-NEXT: vmovdqa64 (%rcx), %zmm11
; AVX512-NEXT: vmovdqa64 (%r8), %zmm0
; AVX512-NEXT: vmovdqa64 (%r9), %zmm2
; AVX512-NEXT: vmovdqa64 (%r11), %zmm1
; AVX512-NEXT: vmovdqa64 (%r10), %zmm3
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9]
; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5
; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm4
; AVX512-NEXT: movb $-64, %r8b
; AVX512-NEXT: kmovw %r8d, %k1
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1}
; AVX512-NEXT: vmovdqa (%rsi), %xmm5
; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10
; AVX512-NEXT: vmovdqa (%rdi), %xmm5
; AVX512-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3]
; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512-NEXT: vpermt2q %zmm11, %zmm5, %zmm9
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13
; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm13
; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm5
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15]
; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512-NEXT: vpermt2q %zmm11, %zmm9, %zmm13
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14
; AVX512-NEXT: vpermt2q %zmm8, %zmm9, %zmm14
; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm9
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm14
; AVX512-NEXT: vpermt2q %zmm11, %zmm13, %zmm14
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15
; AVX512-NEXT: vpermt2q %zmm8, %zmm13, %zmm15
; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm7
; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm6
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm14
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8]
; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm8
; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm7
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10]
; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10
; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm10
; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm8
; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
; AVX512-NEXT: vmovdqa (%rcx), %ymm10
; AVX512-NEXT: vmovdqa (%rdx), %ymm11
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
; AVX512-NEXT: vmovdqa (%rsi), %ymm14
; AVX512-NEXT: vmovdqa (%rdi), %ymm15
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm1
; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rax)
; AVX512-NEXT: vmovdqa64 %zmm7, (%rax)
; AVX512-NEXT: vmovdqa64 %zmm6, 320(%rax)
; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rax)
; AVX512-NEXT: vmovdqa64 %zmm9, 448(%rax)
; AVX512-NEXT: vmovdqa64 %zmm5, 384(%rax)
; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i64_stride8_vf8:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm6
; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm8
; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm7
; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm11
; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm2
; AVX512-FCP-NEXT: vmovdqa64 (%r11), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm3
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9]
; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm4
; AVX512-FCP-NEXT: movb $-64, %r8b
; AVX512-FCP-NEXT: kmovw %r8d, %k1
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1}
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5
; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm5
; AVX512-FCP-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm9
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm13
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm5
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15]
; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm13
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm14
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm9
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14
; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm14
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm15
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm7
; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm6
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm14
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8]
; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm8
; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm7
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10]
; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10
; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm8
; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm10
; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm11
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm14
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm15
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 448(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i64_stride8_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm6
; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm8
; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm7
; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm11
; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm0
; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm2
; AVX512DQ-NEXT: vmovdqa64 (%r11), %zmm1
; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm3
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9]
; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm4
; AVX512DQ-NEXT: movb $-64, %r8b
; AVX512DQ-NEXT: kmovw %r8d, %k1
; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1}
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5
; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5
; AVX512DQ-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm5, %zmm9
; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13
; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm13
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm5
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15]
; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm9, %zmm13
; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14
; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm9, %zmm14
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm9
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm14
; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm13, %zmm14
; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm15
; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm13, %zmm15
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm14, %zmm7
; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm14, %zmm6
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm14
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8]
; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm8
; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm7
; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10]
; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm10
; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm8
; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm10
; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm11
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm14
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm15
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm1
; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm0
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 320(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i64_stride8_vf8:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm7
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm11
; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 (%r11), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm3
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9]
; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm4
; AVX512DQ-FCP-NEXT: movb $-64, %r8b
; AVX512DQ-FCP-NEXT: kmovw %r8d, %k1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm5
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm9
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm13
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm5
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15]
; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm13
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm14
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm9
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm14
; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm14
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm15
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm6
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm14
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8]
; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm8
; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10]
; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10
; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm10
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm11
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm14
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm15
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 448(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i64_stride8_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8
; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm7
; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm11
; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0
; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2
; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm1
; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm3
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9]
; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4
; AVX512BW-NEXT: movb $-64, %r8b
; AVX512BW-NEXT: kmovd %r8d, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1}
; AVX512BW-NEXT: vmovdqa (%rsi), %xmm5
; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5
; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm9
; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13
; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm13
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm5
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15]
; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm9, %zmm13
; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14
; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm14
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm9
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm14
; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15
; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm15
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1}
; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm7
; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm6
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm14
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8]
; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm8
; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10]
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm10
; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm8
; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
; AVX512BW-NEXT: vmovdqa (%rcx), %ymm10
; AVX512BW-NEXT: vmovdqa (%rdx), %ymm11
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
; AVX512BW-NEXT: vmovdqa (%rsi), %ymm14
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm15
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i64_stride8_vf8:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6
; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm8
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm7
; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm11
; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 (%r11), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm3
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9]
; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5
; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5
; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm4
; AVX512BW-FCP-NEXT: movb $-64, %r8b
; AVX512BW-FCP-NEXT: kmovd %r8d, %k1
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm5
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm5
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm9
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm13
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm5
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15]
; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm13
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm14
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm9
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14
; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm14
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm15
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1}
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm7
; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm6
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm14
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8]
; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm8
; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm7
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10]
; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10
; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm8
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm10
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm11
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm14
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm15
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1
; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i64_stride8_vf8:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm8
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm11
; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 (%r11), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm3
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9]
; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:
vmovdqa64 %zmm1, %zmm5 2316; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 2317; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 2318; AVX512DQ-BW-NEXT: movb $-64, %r8b 2319; AVX512DQ-BW-NEXT: kmovd %r8d, %k1 2320; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} 2321; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm5 2322; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 2323; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm5 2324; AVX512DQ-BW-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 2325; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] 2326; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 2327; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] 2328; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2329; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 2330; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 2331; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 2332; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm13 2333; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] 2334; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 2335; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] 2336; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 2337; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] 2338; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2339; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 2340; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 2341; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 2342; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 2343; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] 2344; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 2345; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] 2346; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 2347; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] 2348; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2349; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm14 2350; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm14 2351; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15 2352; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm15 2353; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] 2354; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 2355; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] 2356; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} 2357; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13 2358; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] 2359; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2360; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm7 2361; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 2362; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] 2363; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 2364; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] 2365; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} 2366; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 2367; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] 2368; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2369; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 2370; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 
2371; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 2372; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} 2373; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] 2374; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 2375; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] 2376; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2377; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 2378; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 2379; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 2380; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} 2381; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm10 2382; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm11 2383; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] 2384; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm14 2385; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm15 2386; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 2387; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] 2388; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 2389; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] 2390; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2391; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 2392; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 2393; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} 2394; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] 2395; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] 2396; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] 2397; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 2398; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%rax) 2399; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%rax) 2400; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax) 2401; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 320(%rax) 2402; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 256(%rax) 2403; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 448(%rax) 2404; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 384(%rax) 2405; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 64(%rax) 2406; AVX512DQ-BW-NEXT: vzeroupper 2407; AVX512DQ-BW-NEXT: retq 2408; 2409; AVX512DQ-BW-FCP-LABEL: store_i64_stride8_vf8: 2410; AVX512DQ-BW-FCP: # %bb.0: 2411; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2412; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 2413; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 2414; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 2415; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 2416; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 2417; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm11 2418; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 2419; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 2420; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r11), %zmm1 2421; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm3 2422; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] 2423; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2424; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 2425; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 2426; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 2427; AVX512DQ-BW-FCP-NEXT: movb $-64, %r8b 2428; AVX512DQ-BW-FCP-NEXT: kmovd %r8d, %k1 2429; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} 2430; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm5 2431; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 2432; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm5 2433; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm5, 
%ymm12 2434; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] 2435; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 2436; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] 2437; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2438; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 2439; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 2440; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 2441; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm13 2442; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] 2443; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 2444; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] 2445; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 2446; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] 2447; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2448; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 2449; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 2450; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 2451; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 2452; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] 2453; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 2454; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] 2455; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 2456; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] 2457; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2458; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 2459; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm14 2460; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 2461; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm15 2462; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] 2463; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 2464; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] 2465; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} 2466; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13 2467; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] 2468; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2469; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm7 2470; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 2471; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] 2472; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 2473; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] 2474; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} 2475; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 2476; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] 2477; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2478; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 2479; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 2480; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 2481; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} 2482; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] 2483; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 2484; 
AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] 2485; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2486; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 2487; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 2488; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 2489; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} 2490; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm10 2491; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm11 2492; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] 2493; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm14 2494; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm15 2495; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 2496; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] 2497; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 2498; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] 2499; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2500; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 2501; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 2502; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} 2503; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] 2504; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] 2505; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] 2506; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 2507; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) 2508; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) 2509; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) 2510; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax) 2511; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) 2512; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rax) 2513; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) 2514; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax) 2515; AVX512DQ-BW-FCP-NEXT: vzeroupper 2516; AVX512DQ-BW-FCP-NEXT: retq 2517 %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64 2518 %in.vec1 = load <8 x i64>, ptr %in.vecptr1, align 64 2519 %in.vec2 = load <8 x i64>, ptr %in.vecptr2, align 64 2520 %in.vec3 = load <8 x i64>, ptr %in.vecptr3, align 64 2521 %in.vec4 = load <8 x i64>, ptr %in.vecptr4, align 64 2522 %in.vec5 = load <8 x i64>, ptr %in.vecptr5, align 64 2523 %in.vec6 = load <8 x i64>, ptr %in.vecptr6, align 64 2524 %in.vec7 = load <8 x i64>, ptr %in.vecptr7, align 64 2525 %1 = shufflevector <8 x i64> %in.vec0, <8 x i64> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2526 %2 = shufflevector <8 x i64> %in.vec2, <8 x i64> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2527 %3 = shufflevector <8 x i64> %in.vec4, <8 x i64> %in.vec5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2528 %4 = shufflevector <8 x i64> %in.vec6, <8 x i64> %in.vec7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2529 %5 = shufflevector <16 x i64> %1, <16 x i64> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, 
  %6 = shufflevector <16 x i64> %3, <16 x i64> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %7 = shufflevector <32 x i64> %5, <32 x i64> %6, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %interleaved.vec = shufflevector <64 x i64> %7, <64 x i64> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
  store <64 x i64> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind {
; SSE-LABEL: store_i64_stride8_vf16:
; SSE: # %bb.0:
; SSE-NEXT: subq $664, %rsp # imm = 0x298
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movaps (%rdi), %xmm7
; SSE-NEXT: movaps 16(%rdi), %xmm8
; SSE-NEXT: movaps (%rsi), %xmm2
; SSE-NEXT: movaps 16(%rsi), %xmm0
; SSE-NEXT: movaps (%rdx), %xmm9
; SSE-NEXT: movaps 16(%rdx), %xmm10
; SSE-NEXT: movaps (%rcx), %xmm4
; SSE-NEXT: movaps 16(%rcx), %xmm1
; SSE-NEXT: movaps (%r8), %xmm11
; SSE-NEXT: movaps 16(%r8), %xmm12
; SSE-NEXT: movaps (%r9), %xmm5
; SSE-NEXT: movaps 16(%r9), %xmm3
; SSE-NEXT: movaps (%r10), %xmm13
; SSE-NEXT: movaps 16(%r10), %xmm15
; SSE-NEXT: movaps (%rax), %xmm6
; SSE-NEXT: movaps %xmm7, %xmm14
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0]
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm9, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1]
; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm11, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1]
; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm13, %xmm4
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1]
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm8, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm10, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1]
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm12, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1]
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 16(%rax), %xmm0
; SSE-NEXT: movaps %xmm15, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 32(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 32(%rdx), %xmm2
; SSE-NEXT: movaps 32(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 32(%r8), %xmm2
; SSE-NEXT: movaps 32(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 32(%r10), %xmm2
; SSE-NEXT: movaps 32(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 48(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 48(%rdx), %xmm2
; SSE-NEXT: movaps 48(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 48(%r8), %xmm2
; SSE-NEXT: movaps 48(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 48(%r10), %xmm2
; SSE-NEXT: movaps 48(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%rdi), %xmm2
; SSE-NEXT: movaps 64(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%rdx), %xmm2
; SSE-NEXT: movaps 64(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%r8), %xmm2
; SSE-NEXT: movaps 64(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%r10), %xmm2
; SSE-NEXT: movaps 64(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 80(%rdi), %xmm2
; SSE-NEXT: movaps 80(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 80(%rdx), %xmm2
; SSE-NEXT: movaps 80(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 80(%r8), %xmm2
; SSE-NEXT: movaps 80(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 80(%r10), %xmm2
; SSE-NEXT: movaps 80(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 96(%rdi), %xmm13
; SSE-NEXT: movaps 96(%rsi), %xmm0
; SSE-NEXT: movaps %xmm13, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
; SSE-NEXT: movaps 96(%rdx), %xmm10
; SSE-NEXT: movaps 96(%rcx), %xmm0
; SSE-NEXT: movaps %xmm10, %xmm15
; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
; SSE-NEXT: movaps 96(%r8), %xmm11
; SSE-NEXT: movaps 96(%r9), %xmm0
; SSE-NEXT: movaps %xmm11, %xmm14
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
; SSE-NEXT: movaps 96(%r10), %xmm9
; SSE-NEXT: movaps 96(%rax), %xmm0
; SSE-NEXT: movaps %xmm9, %xmm12
; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1]
; SSE-NEXT: movaps 112(%rdi), %xmm7
; SSE-NEXT: movaps 112(%rsi), %xmm0
; SSE-NEXT: movaps %xmm7, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1]
; SSE-NEXT: movaps 112(%rdx), %xmm5
; SSE-NEXT: movaps 112(%rcx), %xmm1
; SSE-NEXT: movaps %xmm5, %xmm6
; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE-NEXT: movaps 112(%r8), %xmm1
; SSE-NEXT: movaps 112(%r9), %xmm2
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-NEXT: movaps 112(%r10), %xmm2
; SSE-NEXT: movaps 112(%rax), %xmm3
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps %xmm2, 1008(%rax)
; SSE-NEXT: movaps %xmm1, 992(%rax)
; SSE-NEXT: movaps %xmm5, 976(%rax)
; SSE-NEXT: movaps %xmm7, 960(%rax)
; SSE-NEXT: movaps %xmm0, 944(%rax)
; SSE-NEXT: movaps %xmm4, 928(%rax)
; SSE-NEXT: movaps %xmm6, 912(%rax)
; SSE-NEXT: movaps %xmm8, 896(%rax)
; SSE-NEXT: movaps %xmm9, 880(%rax)
; SSE-NEXT: movaps %xmm11, 864(%rax)
; SSE-NEXT: movaps %xmm10, 848(%rax)
; SSE-NEXT: movaps %xmm13, 832(%rax)
; SSE-NEXT: movaps %xmm12, 816(%rax)
; SSE-NEXT: movaps %xmm14, 800(%rax)
; SSE-NEXT: movaps %xmm15, 784(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 768(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 752(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 736(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 720(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 704(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 688(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 672(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 656(%rax)
; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 640(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 624(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 608(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 592(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 576(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 560(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 544(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 528(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 512(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 496(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 480(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 464(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 448(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 432(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 416(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 400(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 384(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 368(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 352(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 336(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 320(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 304(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 288(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 272(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 256(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 240(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 224(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 208(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 192(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 176(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 160(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 144(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 128(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 112(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 96(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 64(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 48(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, (%rax)
; SSE-NEXT: addq $664, %rsp # imm = 0x298
; SSE-NEXT: retq
;
; AVX-LABEL: store_i64_stride8_vf16:
; AVX: # %bb.0:
; AVX-NEXT: subq $488, %rsp # imm = 0x1E8
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps (%rsi), %xmm4
; AVX-NEXT: vmovaps (%rdi), %xmm5
; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0]
; AVX-NEXT: vmovaps (%rcx), %xmm6
; AVX-NEXT: vmovaps 32(%rcx), %xmm2
; AVX-NEXT: vmovaps 64(%rcx), %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm3
; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm1, %ymm1
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[2]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps (%r9), %xmm7
; AVX-NEXT: vmovaps (%r8), %xmm8
; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0]
; AVX-NEXT: vmovaps (%rax), %xmm10
; AVX-NEXT: vmovaps 32(%rax), %xmm3
; AVX-NEXT: vmovaps 64(%rax), %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm11
; AVX-NEXT: vinsertf128 $1, (%r10), %ymm9, %ymm9
; AVX-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2],ymm11[2]
; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm5
; AVX-NEXT: vbroadcastsd 8(%rdx), %ymm6
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1]
; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm5
; AVX-NEXT: vbroadcastsd 8(%r10), %ymm6
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 32(%rsi), %xmm4
; AVX-NEXT: vmovaps 32(%rdi), %xmm5
; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7
; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm6, %ymm6
; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
AVX-NEXT: vmovaps 32(%r9), %xmm6 2914; AVX-NEXT: vmovaps 32(%r8), %xmm7 2915; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] 2916; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm9 2917; AVX-NEXT: vinsertf128 $1, 32(%r10), %ymm8, %ymm8 2918; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[2] 2919; AVX-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2920; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] 2921; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 2922; AVX-NEXT: vbroadcastsd 40(%rdx), %ymm5 2923; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 2924; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] 2925; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2926; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] 2927; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 2928; AVX-NEXT: vbroadcastsd 40(%r10), %ymm4 2929; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 2930; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] 2931; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2932; AVX-NEXT: vmovaps 64(%rsi), %xmm2 2933; AVX-NEXT: vmovaps 64(%rdi), %xmm3 2934; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 2935; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm5 2936; AVX-NEXT: vinsertf128 $1, 64(%rdx), %ymm4, %ymm4 2937; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[2] 2938; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2939; AVX-NEXT: vmovaps 64(%r9), %xmm4 2940; AVX-NEXT: vmovaps 64(%r8), %xmm5 2941; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] 2942; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm7 2943; AVX-NEXT: vinsertf128 $1, 64(%r10), %ymm6, %ymm6 2944; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] 2945; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2946; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] 2947; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 2948; AVX-NEXT: vbroadcastsd 72(%rdx), %ymm3 2949; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 2950; AVX-NEXT: vmovaps 96(%rcx), %xmm3 2951; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] 2952; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2953; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] 2954; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 2955; AVX-NEXT: vbroadcastsd 72(%r10), %ymm2 2956; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 2957; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 2958; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2959; AVX-NEXT: vmovaps 96(%rsi), %xmm0 2960; AVX-NEXT: vmovaps 96(%rdi), %xmm1 2961; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 2962; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 2963; AVX-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm2 2964; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] 2965; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2966; AVX-NEXT: vmovaps 96(%rax), %xmm2 2967; AVX-NEXT: vmovaps 96(%r9), %xmm4 2968; AVX-NEXT: vmovaps 96(%r8), %xmm5 2969; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] 2970; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7 2971; AVX-NEXT: vinsertf128 $1, 96(%r10), %ymm6, %ymm6 2972; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] 2973; AVX-NEXT: vmovupd %ymm6, (%rsp) # 32-byte Spill 2974; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = 
xmm1[1],xmm0[1] 2975; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 2976; AVX-NEXT: vbroadcastsd 104(%rdx), %ymm3 2977; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 2978; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 2979; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2980; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] 2981; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 2982; AVX-NEXT: vbroadcastsd 104(%r10), %ymm2 2983; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 2984; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 2985; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2986; AVX-NEXT: vmovaps 16(%rsi), %xmm0 2987; AVX-NEXT: vmovaps 16(%rdi), %xmm1 2988; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 2989; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 2990; AVX-NEXT: vbroadcastsd 16(%rcx), %ymm3 2991; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 2992; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2993; AVX-NEXT: vmovaps 16(%r9), %xmm2 2994; AVX-NEXT: vmovaps 16(%r8), %xmm3 2995; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 2996; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 2997; AVX-NEXT: vbroadcastsd 16(%rax), %ymm5 2998; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 2999; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3000; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 3001; AVX-NEXT: vbroadcastsd 24(%rdx), %ymm1 3002; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3003; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 3004; AVX-NEXT: vbroadcastsd 24(%r10), %ymm1 3005; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3006; AVX-NEXT: vmovaps 48(%rsi), %xmm0 3007; AVX-NEXT: vmovaps 48(%rdi), %xmm1 3008; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 3009; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 3010; AVX-NEXT: vbroadcastsd 48(%rcx), %ymm3 3011; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 3012; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3013; AVX-NEXT: vmovaps 48(%r9), %xmm2 3014; AVX-NEXT: vmovaps 48(%r8), %xmm4 3015; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm2[0] 3016; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] 3017; AVX-NEXT: vbroadcastsd 48(%rax), %ymm7 3018; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4,5],ymm7[6,7] 3019; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 3020; AVX-NEXT: vbroadcastsd 56(%rdx), %ymm1 3021; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3022; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] 3023; AVX-NEXT: vbroadcastsd 56(%r10), %ymm1 3024; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3025; AVX-NEXT: vmovaps 80(%rsi), %xmm5 3026; AVX-NEXT: vmovaps 80(%rdi), %xmm2 3027; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm5[0] 3028; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] 3029; AVX-NEXT: vbroadcastsd 80(%rcx), %ymm4 3030; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5],ymm4[6,7] 3031; AVX-NEXT: vmovaps 80(%r9), %xmm1 3032; AVX-NEXT: vmovaps 80(%r8), %xmm0 3033; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm1[0] 3034; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] 3035; AVX-NEXT: vbroadcastsd 80(%rax), %ymm14 3036; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm14[6,7] 3037; AVX-NEXT: 
vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] 3038; AVX-NEXT: vbroadcastsd 88(%rdx), %ymm5 3039; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm5[4,5,6,7] 3040; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 3041; AVX-NEXT: vbroadcastsd 88(%r10), %ymm1 3042; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3043; AVX-NEXT: vmovaps 112(%rsi), %xmm1 3044; AVX-NEXT: vmovaps 112(%rdi), %xmm5 3045; AVX-NEXT: vmovlhps {{.*#+}} xmm14 = xmm5[0],xmm1[0] 3046; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] 3047; AVX-NEXT: vbroadcastsd 112(%rcx), %ymm15 3048; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 3049; AVX-NEXT: vmovaps 112(%r9), %xmm15 3050; AVX-NEXT: vmovaps 112(%r8), %xmm0 3051; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm15[0] 3052; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] 3053; AVX-NEXT: vbroadcastsd 112(%rax), %ymm12 3054; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] 3055; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] 3056; AVX-NEXT: vbroadcastsd 120(%rdx), %ymm5 3057; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] 3058; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] 3059; AVX-NEXT: vbroadcastsd 120(%r10), %ymm5 3060; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] 3061; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdx 3062; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] 3063; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],mem[6,7] 3064; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] 3065; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] 3066; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] 3067; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] 3068; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] 3069; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 3070; AVX-NEXT: vmovaps %ymm0, 992(%rdx) 3071; AVX-NEXT: vmovaps %ymm1, 960(%rdx) 3072; AVX-NEXT: vmovaps %ymm12, 928(%rdx) 3073; AVX-NEXT: vmovaps %ymm14, 896(%rdx) 3074; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3075; AVX-NEXT: vmovaps %ymm0, 864(%rdx) 3076; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3077; AVX-NEXT: vmovaps %ymm0, 832(%rdx) 3078; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3079; AVX-NEXT: vmovaps %ymm0, 800(%rdx) 3080; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3081; AVX-NEXT: vmovaps %ymm0, 768(%rdx) 3082; AVX-NEXT: vmovaps %ymm2, 736(%rdx) 3083; AVX-NEXT: vmovaps %ymm3, 704(%rdx) 3084; AVX-NEXT: vmovaps %ymm4, 672(%rdx) 3085; AVX-NEXT: vmovaps %ymm9, 640(%rdx) 3086; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3087; AVX-NEXT: vmovaps %ymm0, 608(%rdx) 3088; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3089; AVX-NEXT: vmovaps %ymm0, 576(%rdx) 3090; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3091; AVX-NEXT: vmovaps %ymm0, 544(%rdx) 3092; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3093; AVX-NEXT: vmovaps %ymm0, 512(%rdx) 3094; AVX-NEXT: vmovaps %ymm7, 480(%rdx) 3095; AVX-NEXT: vmovaps %ymm8, 448(%rdx) 3096; AVX-NEXT: vmovaps %ymm11, 416(%rdx) 3097; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3098; AVX-NEXT: vmovaps %ymm0, 384(%rdx) 3099; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3100; AVX-NEXT: vmovaps %ymm0, 352(%rdx) 3101; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 
# 32-byte Reload 3102; AVX-NEXT: vmovaps %ymm0, 320(%rdx) 3103; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3104; AVX-NEXT: vmovaps %ymm0, 288(%rdx) 3105; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3106; AVX-NEXT: vmovaps %ymm0, 256(%rdx) 3107; AVX-NEXT: vmovaps %ymm5, 224(%rdx) 3108; AVX-NEXT: vmovaps %ymm6, 192(%rdx) 3109; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3110; AVX-NEXT: vmovaps %ymm0, 160(%rdx) 3111; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3112; AVX-NEXT: vmovaps %ymm0, 128(%rdx) 3113; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3114; AVX-NEXT: vmovaps %ymm0, 96(%rdx) 3115; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3116; AVX-NEXT: vmovaps %ymm0, 64(%rdx) 3117; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3118; AVX-NEXT: vmovaps %ymm0, 32(%rdx) 3119; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3120; AVX-NEXT: vmovaps %ymm0, (%rdx) 3121; AVX-NEXT: addq $488, %rsp # imm = 0x1E8 3122; AVX-NEXT: vzeroupper 3123; AVX-NEXT: retq 3124; 3125; AVX2-LABEL: store_i64_stride8_vf16: 3126; AVX2: # %bb.0: 3127; AVX2-NEXT: subq $488, %rsp # imm = 0x1E8 3128; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 3129; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 3130; AVX2-NEXT: vmovaps (%rcx), %xmm0 3131; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3132; AVX2-NEXT: vmovaps 32(%rcx), %xmm3 3133; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3134; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 3135; AVX2-NEXT: vmovaps (%rsi), %xmm2 3136; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3137; AVX2-NEXT: vmovaps 32(%rsi), %xmm4 3138; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3139; AVX2-NEXT: vmovaps (%rdi), %xmm1 3140; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3141; AVX2-NEXT: vmovaps 32(%rdi), %xmm5 3142; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3143; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 3144; AVX2-NEXT: vbroadcastsd 8(%rdx), %ymm2 3145; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 3146; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 3147; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3148; AVX2-NEXT: vmovaps (%rax), %xmm0 3149; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3150; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 3151; AVX2-NEXT: vmovaps (%r9), %xmm1 3152; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill 3153; AVX2-NEXT: vmovaps 32(%r9), %xmm6 3154; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3155; AVX2-NEXT: vmovaps (%r8), %xmm15 3156; AVX2-NEXT: vmovaps 32(%r8), %xmm7 3157; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3158; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] 3159; AVX2-NEXT: vbroadcastsd 8(%r10), %ymm2 3160; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 3161; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 3162; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3163; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 3164; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] 3165; AVX2-NEXT: vbroadcastsd 40(%rdx), %ymm2 3166; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 3167; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 3168; AVX2-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1]
; AVX2-NEXT: vbroadcastsd 40(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 32(%rax), %xmm12
; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 64(%rsi), %xmm11
; AVX2-NEXT: vmovaps 64(%rdi), %xmm10
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1]
; AVX2-NEXT: vbroadcastsd 72(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 64(%rcx), %xmm9
; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 64(%r9), %xmm8
; AVX2-NEXT: vmovaps 64(%r8), %xmm7
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1]
; AVX2-NEXT: vbroadcastsd 72(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 64(%rax), %xmm6
; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 96(%rsi), %xmm5
; AVX2-NEXT: vmovaps 96(%rdi), %xmm4
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1]
; AVX2-NEXT: vbroadcastsd 104(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 96(%rcx), %xmm3
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 96(%r9), %xmm2
; AVX2-NEXT: vmovaps 96(%r8), %xmm1
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1]
; AVX2-NEXT: vbroadcastsd 104(%r10), %ymm14
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm14[4,5,6,7]
; AVX2-NEXT: vmovaps 96(%rax), %xmm0
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-NEXT: # xmm13 = xmm13[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm13, %ymm13
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd (%rsp), %xmm15, %xmm13 # 16-byte Folded Reload
; AVX2-NEXT: # xmm13 = xmm15[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, (%r10), %ymm13, %ymm13
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-NEXT: # xmm13 = xmm13[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 32(%rdx), %ymm13, %ymm13
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-NEXT: # xmm13 = xmm13[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 32(%r10), %ymm13, %ymm13
; AVX2-NEXT: vbroadcastsd %xmm12, %ymm12
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0]
; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm10, %ymm10
; AVX2-NEXT: vbroadcastsd %xmm9, %ymm9
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0]
; AVX2-NEXT: vinsertf128 $1, 64(%r10), %ymm7, %ymm7
; AVX2-NEXT: vbroadcastsd %xmm6, %ymm6
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX2-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm4, %ymm4
; AVX2-NEXT: vbroadcastsd %xmm3, %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vinsertf128 $1, 96(%r10), %ymm1, %ymm1
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps (%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 16(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps (%r8), %ymm2
; AVX2-NEXT: vmovaps (%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 16(%rax), %ymm6
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 24(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 24(%r10), %ymm2
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vmovaps 32(%rdi), %ymm6
; AVX2-NEXT: vmovaps 32(%rsi), %ymm7
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm7[0],ymm6[2],ymm7[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 32(%r8), %ymm8
; AVX2-NEXT: vmovaps 32(%r9), %ymm9
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 48(%rax), %ymm10
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3]
; AVX2-NEXT: vbroadcastsd 56(%rdx), %ymm7
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3],ymm7[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
; AVX2-NEXT: vbroadcastsd 56(%r10), %ymm8
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[2,3],ymm8[2,3]
; AVX2-NEXT: vmovaps 64(%rdi), %ymm10
; AVX2-NEXT: vmovaps 64(%rsi), %ymm11
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 80(%rcx), %ymm9
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
; AVX2-NEXT: vmovaps 64(%r8), %ymm1
; AVX2-NEXT: vmovaps 64(%r9), %ymm0
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 80(%rax), %ymm14
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
; AVX2-NEXT: vbroadcastsd 88(%rdx), %ymm11
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX2-NEXT: vbroadcastsd 88(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovaps 96(%rdi), %ymm1
; AVX2-NEXT: vmovaps 96(%rsi), %ymm11
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm11[0],ymm1[2],ymm11[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 112(%rcx), %ymm15
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovaps 96(%r8), %ymm15
; AVX2-NEXT: vmovaps 96(%r9), %ymm0
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 112(%rax), %ymm12
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3]
; AVX2-NEXT: vbroadcastsd 120(%rdx), %ymm11
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm11[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3]
; AVX2-NEXT: vbroadcastsd 120(%r10), %ymm11
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm11[2,3]
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovaps %ymm0, 992(%rdx)
; AVX2-NEXT: vmovaps %ymm1, 960(%rdx)
; AVX2-NEXT: vmovaps %ymm12, 928(%rdx)
; AVX2-NEXT: vmovaps %ymm14, 896(%rdx)
; AVX2-NEXT: vmovaps %ymm2, 736(%rdx)
; AVX2-NEXT: vmovaps %ymm10, 704(%rdx)
; AVX2-NEXT: vmovaps %ymm9, 672(%rdx)
; AVX2-NEXT: vmovaps %ymm6, 640(%rdx)
; AVX2-NEXT: vmovaps %ymm8, 480(%rdx)
; AVX2-NEXT: vmovaps %ymm7, 448(%rdx)
; AVX2-NEXT: vmovaps %ymm3, 416(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 384(%rdx)
; AVX2-NEXT: vmovaps %ymm4, 224(%rdx)
; AVX2-NEXT: vmovaps %ymm5, 192(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 160(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 128(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 864(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 832(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 800(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 768(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 608(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 576(%rdx)
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 544(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 512(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 352(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 320(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 288(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 256(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 96(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 64(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-NEXT: addq $488, %rsp # imm = 0x1E8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i64_stride8_vf16:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: subq $488, %rsp # imm = 0x1E8
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovaps (%rcx), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm3
; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovaps (%rsi), %xmm2
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm4
; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps (%rdi), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm5
; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX2-FP-NEXT: vbroadcastsd 8(%rdx), %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps (%rax), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovaps (%r9), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm6
; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps (%r8), %xmm15
; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm7
; AVX2-FP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1]
; AVX2-FP-NEXT: vbroadcastsd 8(%r10), %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1]
; AVX2-FP-NEXT: vbroadcastsd 40(%rdx), %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1]
; AVX2-FP-NEXT: vbroadcastsd 40(%r10), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm12
; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm11
; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm10
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1]
; AVX2-FP-NEXT: vbroadcastsd 72(%rdx), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 64(%rcx), %xmm9
; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 64(%r9), %xmm8
; AVX2-FP-NEXT: vmovaps 64(%r8), %xmm7
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1]
; AVX2-FP-NEXT: vbroadcastsd 72(%r10), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 64(%rax), %xmm6
; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 96(%rsi), %xmm5
; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm4
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1]
; AVX2-FP-NEXT: vbroadcastsd 104(%rdx), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 96(%rcx), %xmm3
; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 96(%r9), %xmm2
; AVX2-FP-NEXT: vmovaps 96(%r8), %xmm1
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1]
; AVX2-FP-NEXT: vbroadcastsd 104(%r10), %ymm14
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 96(%rax), %xmm0
; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm13 = xmm13[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm13, %ymm13
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd (%rsp), %xmm15, %xmm13 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm13 = xmm15[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, (%r10), %ymm13, %ymm13
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm13 = xmm13[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, 32(%rdx), %ymm13, %ymm13
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm13 = xmm13[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, 32(%r10), %ymm13, %ymm13
; AVX2-FP-NEXT: vbroadcastsd %xmm12, %ymm12
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0]
; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm10, %ymm10
; AVX2-FP-NEXT: vbroadcastsd %xmm9, %ymm9
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0]
; AVX2-FP-NEXT: vinsertf128 $1, 64(%r10), %ymm7, %ymm7
; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm6
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FP-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm4, %ymm4
; AVX2-FP-NEXT: vbroadcastsd %xmm3, %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vinsertf128 $1, 96(%r10), %ymm1, %ymm1
; AVX2-FP-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 16(%rcx), %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps (%r8), %ymm2
; AVX2-FP-NEXT: vmovaps (%r9), %ymm3
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 16(%rax), %ymm6
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FP-NEXT: vbroadcastsd 24(%rdx), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vbroadcastsd 24(%r10), %ymm2
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3]
; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm6
; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm7
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm7[0],ymm6[2],ymm7[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm8
; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm9
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 48(%rax), %ymm10
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3]
; AVX2-FP-NEXT: vbroadcastsd 56(%rdx), %ymm7
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3],ymm7[2,3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
; AVX2-FP-NEXT: vbroadcastsd 56(%r10), %ymm8
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[2,3],ymm8[2,3]
; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm10
; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm11
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 80(%rcx), %ymm9
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FP-NEXT: vmovaps 64(%r8), %ymm1
; AVX2-FP-NEXT: vmovaps 64(%r9), %ymm0
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 80(%rax), %ymm14
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
; AVX2-FP-NEXT: vbroadcastsd 88(%rdx), %ymm11
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX2-FP-NEXT: vbroadcastsd 88(%r10), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1
; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm11
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm11[0],ymm1[2],ymm11[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 112(%rcx), %ymm15
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovaps 96(%r8), %ymm15
; AVX2-FP-NEXT: vmovaps 96(%r9), %ymm0
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 112(%rax), %ymm12
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3]
; AVX2-FP-NEXT: vbroadcastsd 120(%rdx), %ymm11
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm11[2,3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3]
; AVX2-FP-NEXT: vbroadcastsd 120(%r10), %ymm11
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm11[2,3]
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovaps %ymm0, 992(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm1, 960(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm12, 928(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm14, 896(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm2, 736(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm10, 704(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm9, 672(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm6, 640(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm8, 480(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm7, 448(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm3, 416(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm4, 224(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm5, 192(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 864(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 832(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 800(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 768(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 608(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 576(%rdx)
; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 544(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 512(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-FP-NEXT: addq $488, %rsp # imm = 0x1E8
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i64_stride8_vf16:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $488, %rsp # imm = 0x1E8
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm3
; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm4
; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm5
; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX2-FCP-NEXT: vbroadcastsd 8(%rdx), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rax), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vmovaps (%r9), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%r9), %xmm6
; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps (%r8), %xmm15
; AVX2-FCP-NEXT: vmovaps 32(%r8), %xmm7
; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1]
; AVX2-FCP-NEXT: vbroadcastsd 8(%r10), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1]
; AVX2-FCP-NEXT: vbroadcastsd 40(%rdx), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1]
; AVX2-FCP-NEXT: vbroadcastsd 40(%r10), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm12
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm11
; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm10
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1]
; AVX2-FCP-NEXT: vbroadcastsd 72(%rdx), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 64(%rcx), %xmm9
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%r9), %xmm8
; AVX2-FCP-NEXT: vmovaps 64(%r8), %xmm7
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1]
; AVX2-FCP-NEXT: vbroadcastsd 72(%r10), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 64(%rax), %xmm6
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm5
; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm4
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1]
; AVX2-FCP-NEXT: vbroadcastsd 104(%rdx), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 96(%rcx), %xmm3
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 96(%r9), %xmm2
; AVX2-FCP-NEXT: vmovaps 96(%r8), %xmm1
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1]
; AVX2-FCP-NEXT: vbroadcastsd 104(%r10), %ymm14
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm14[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 96(%rax), %xmm0
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm13 = xmm13[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm13, %ymm13
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd (%rsp), %xmm15, %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm13 = xmm15[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, (%r10), %ymm13, %ymm13
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm13 = xmm13[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rdx), %ymm13, %ymm13
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm13 = xmm13[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r10), %ymm13, %ymm13
; AVX2-FCP-NEXT: vbroadcastsd %xmm12, %ymm12
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm10, %ymm10
; AVX2-FCP-NEXT: vbroadcastsd %xmm9, %ymm9
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 64(%r10), %ymm7, %ymm7
; AVX2-FCP-NEXT: vbroadcastsd %xmm6, %ymm6
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm4, %ymm4
; AVX2-FCP-NEXT: vbroadcastsd %xmm3, %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 96(%r10), %ymm1, %ymm1
; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 16(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps (%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 16(%rax), %ymm6
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 24(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 24(%r10), %ymm2
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3]
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm7
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm7[0],ymm6[2],ymm7[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm8
; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm9
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 48(%rax), %ymm10
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3]
; AVX2-FCP-NEXT: vbroadcastsd 56(%rdx), %ymm7
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3],ymm7[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
; AVX2-FCP-NEXT: vbroadcastsd 56(%r10), %ymm8
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm10
; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm11
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 80(%rcx), %ymm9
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm1
; AVX2-FCP-NEXT: vmovaps 64(%r9), %ymm0
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 80(%rax), %ymm14
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
; AVX2-FCP-NEXT: vbroadcastsd 88(%rdx), %ymm11
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX2-FCP-NEXT: vbroadcastsd 88(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm11
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm11[0],ymm1[2],ymm11[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 112(%rcx), %ymm15
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovaps 96(%r8), %ymm15
; AVX2-FCP-NEXT: vmovaps 96(%r9), %ymm0
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 112(%rax), %ymm12
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3]
; AVX2-FCP-NEXT: vbroadcastsd 120(%rdx), %ymm11
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm11[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3]
; AVX2-FCP-NEXT: vbroadcastsd 120(%r10), %ymm11
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm11[2,3]
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovaps %ymm0, 992(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm1, 960(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm12, 928(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm14, 896(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm2, 736(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm10, 704(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm9, 672(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm6, 640(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm8, 480(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm7, 448(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm3, 416(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm4, 224(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm5, 192(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 864(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 832(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 800(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 768(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 608(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 576(%rdx)
; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 544(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 512(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-FCP-NEXT: addq $488, %rsp # imm = 0x1E8
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i64_stride8_vf16:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm11
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm15
; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm27
; AVX512-NEXT: vmovdqa64 (%rsi), %zmm17
; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm14
; AVX512-NEXT: vmovdqa64 (%rdx), %zmm5
; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm28
; AVX512-NEXT: vmovdqa64 (%rcx), %zmm26
; AVX512-NEXT: vmovdqa64 (%r8), %zmm6
; AVX512-NEXT: vmovdqa64 64(%r8), %zmm0
; AVX512-NEXT: vmovdqa64 (%r9), %zmm7
; AVX512-NEXT: vmovdqa64 64(%r9), %zmm1
; AVX512-NEXT: vmovdqa64 (%r11), %zmm8
; AVX512-NEXT: vmovdqa64 64(%r11), %zmm3
; AVX512-NEXT: vmovdqa64 (%r10), %zmm9
; AVX512-NEXT: vmovdqa64 64(%r10), %zmm4
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9]
; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10
; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm10
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm12
; AVX512-NEXT: movb $-64, %r8b
; AVX512-NEXT: kmovw %r8d, %k1
; AVX512-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1}
; AVX512-NEXT: vmovdqa (%rsi), %xmm10
; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10
; AVX512-NEXT: vmovdqa64 (%rdi), %xmm16
; AVX512-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3]
; AVX512-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8]
; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512-NEXT: vpermt2q %zmm9, %zmm19, %zmm12
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm20
; AVX512-NEXT: vpermt2q %zmm7, %zmm19, %zmm20
; AVX512-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1}
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2]
; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13]
; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10
; AVX512-NEXT: vpermt2q %zmm26, %zmm20, %zmm10
; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12
; AVX512-NEXT: vpermt2q %zmm17, %zmm20, %zmm12
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm12
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7]
; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12]
; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12
; AVX512-NEXT: vpermt2q %zmm26, %zmm23, %zmm12
; AVX512-NEXT: vmovdqa64 %zmm15, %zmm2
; AVX512-NEXT: vpermt2q %zmm17, %zmm23, %zmm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512-NEXT: vpermt2q %zmm9, %zmm23, %zmm12
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15]
; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2
; AVX512-NEXT: vpermt2q %zmm26, %zmm24, %zmm2
; AVX512-NEXT: vmovdqa64 %zmm15, %zmm13
; AVX512-NEXT: vpermt2q %zmm17, %zmm24, %zmm13
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13
; AVX512-NEXT: vpermt2q %zmm7, %zmm24, %zmm13
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7]
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14]
; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q %zmm26, %zmm25, %zmm5
; AVX512-NEXT: vpermt2q %zmm17, %zmm25, %zmm15
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5
; AVX512-NEXT: vpermt2q %zmm7, %zmm25, %zmm5
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6]
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2
; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm2
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm18
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1}
; AVX512-NEXT: vmovdqa 64(%rsi), %xmm2
; AVX512-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3]
; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13
; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm13
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm19
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1}
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2]
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19
; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2
; AVX512-NEXT: vpermt2q %zmm28, %zmm20, %zmm2
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512-NEXT: vpermt2q %zmm27, %zmm20, %zmm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm20
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; AVX512-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20
; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2
; AVX512-NEXT: vpermt2q %zmm28, %zmm23, %zmm2
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512-NEXT: vpermt2q %zmm27, %zmm23, %zmm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm23
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
; AVX512-NEXT: vmovdqa64 %zmm14, %zmm5
; AVX512-NEXT: vpermt2q %zmm28, %zmm24, %zmm5
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13
; AVX512-NEXT: vpermt2q %zmm27, %zmm24, %zmm13
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm24
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5
; AVX512-NEXT: vpermt2q %zmm28, %zmm25, %zmm14
; AVX512-NEXT: vpermt2q %zmm27, %zmm25, %zmm11
; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm25
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6]
; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11]
; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14
; AVX512-NEXT: vpermt2q %zmm9, %zmm13, %zmm14
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15
; AVX512-NEXT: vpermt2q %zmm7, %zmm13, %zmm15
; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
; AVX512-NEXT: vmovdqa (%rcx), %ymm14
; AVX512-NEXT: vmovdqa64 64(%rcx), %ymm23
; AVX512-NEXT: vmovdqa64 (%rdx), %ymm24
; AVX512-NEXT: vmovdqa64 64(%rdx), %ymm25
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3]
; AVX512-NEXT: vmovdqa64 (%rsi), %ymm26
; AVX512-NEXT: vmovdqa64 64(%rsi), %ymm27
; AVX512-NEXT: vmovdqa64 (%rdi), %ymm28
; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm30
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q %zmm9, %zmm12, %zmm8
; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm6
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1}
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2]
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7
; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm7
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm13
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3]
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7
; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm3
; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2]
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, 640(%rax)
; AVX512-NEXT: vmovdqa64 %zmm7, 704(%rax)
; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax)
; AVX512-NEXT: vmovdqa64 %zmm10, 192(%rax)
; AVX512-NEXT: vmovdqa64 %zmm11, 896(%rax)
; AVX512-NEXT: vmovdqa64 %zmm5, 960(%rax)
; AVX512-NEXT: vmovdqa64 %zmm2, 768(%rax)
; AVX512-NEXT: vmovdqa64 %zmm20, 832(%rax)
; AVX512-NEXT: vmovdqa64 %zmm19, 512(%rax)
; AVX512-NEXT: vmovdqa64 %zmm18, 576(%rax)
; AVX512-NEXT: vmovdqa64 %zmm17, 384(%rax)
; AVX512-NEXT: vmovdqa64 %zmm16, 448(%rax)
; AVX512-NEXT: vmovdqa64 %zmm31, 256(%rax)
; AVX512-NEXT: vmovdqa64 %zmm29, 320(%rax)
; AVX512-NEXT: vmovdqa64 %zmm22, (%rax)
; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i64_stride8_vf16:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm15
; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm27
; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm17
; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14
; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm5
; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28
; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm26
; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm6
; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm7
; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 (%r11), %zmm8
; AVX512-FCP-NEXT: vmovdqa64 64(%r11), %zmm3
; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm9
; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm4
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9]
; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm10
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm12
; AVX512-FCP-NEXT: movb $-64, %r8b
; AVX512-FCP-NEXT: kmovw %r8d, %k1
; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1}
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm10
; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %xmm16
; AVX512-FCP-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8]
; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm12
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm20
; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm20
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1}
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13]
; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm10
; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm12
; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm12
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12]
; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12
; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm2
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15]
; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm2
; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm13
; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm13
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm13
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14]
; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm15
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm5
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm2
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1}
; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm2
; AVX512-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm13
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1}
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm2
; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm5
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm2
; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm5
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm23
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm5
; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13
; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm13
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm14
; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm11
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm25
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11]
; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm14
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm14
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm15
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm14
; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %ymm23
; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %ymm24
; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %ymm25
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3]
; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %ymm26
; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %ymm27
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm28
; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6
AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} 4304; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] 4305; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] 4306; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] 4307; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 4308; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 4309; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 4310; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 4311; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} 4312; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] 4313; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] 4314; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] 4315; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 4316; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 4317; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 4318; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} 4319; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] 4320; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] 4321; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 4322; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 4323; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 640(%rax) 4324; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) 4325; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) 4326; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) 4327; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 896(%rax) 4328; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 960(%rax) 4329; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax) 4330; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 832(%rax) 4331; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) 4332; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax) 4333; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax) 4334; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) 4335; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 256(%rax) 4336; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 320(%rax) 4337; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%rax) 4338; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax) 4339; AVX512-FCP-NEXT: vzeroupper 4340; AVX512-FCP-NEXT: retq 4341; 4342; AVX512DQ-LABEL: store_i64_stride8_vf16: 4343; AVX512DQ: # %bb.0: 4344; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 4345; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 4346; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 4347; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm11 4348; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm15 4349; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm27 4350; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm17 4351; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm14 4352; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5 4353; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm28 4354; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm26 4355; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm6 4356; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm0 4357; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm7 4358; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm1 4359; AVX512DQ-NEXT: vmovdqa64 (%r11), %zmm8 4360; AVX512DQ-NEXT: vmovdqa64 64(%r11), %zmm3 4361; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm9 4362; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm4 4363; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] 4364; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4365; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 4366; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 4367; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 4368; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 4369; AVX512DQ-NEXT: movb $-64, %r8b 4370; AVX512DQ-NEXT: kmovw 
%r8d, %k1 4371; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} 4372; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm10 4373; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 4374; AVX512DQ-NEXT: vmovdqa64 (%rdi), %xmm16 4375; AVX512DQ-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 4376; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] 4377; AVX512DQ-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 4378; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] 4379; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4380; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 4381; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 4382; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm20 4383; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 4384; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} 4385; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] 4386; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 4387; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] 4388; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4389; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 4390; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 4391; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm12 4392; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 4393; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] 4394; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 4395; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 4396; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] 4397; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} 4398; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 4399; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] 4400; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4401; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 4402; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 4403; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm2 4404; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 4405; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] 4406; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 4407; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 4408; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] 4409; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} 4410; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 4411; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] 4412; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4413; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 4414; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 4415; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm13 4416; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 4417; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] 4418; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 4419; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 4420; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] 4421; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 4422; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] 4423; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4424; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 4425; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 4426; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] 4427; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 4428; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 4429; 
AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] 4430; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 4431; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 4432; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 4433; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 4434; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} 4435; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm2 4436; AVX512DQ-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 4437; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5 4438; AVX512DQ-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 4439; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] 4440; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 4441; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 4442; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 4443; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 4444; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} 4445; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] 4446; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 4447; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 4448; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 4449; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 4450; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 4451; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 4452; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 4453; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] 4454; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} 4455; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 4456; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 4457; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 4458; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 4459; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 4460; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 4461; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 4462; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 4463; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} 4464; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 4465; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm5 4466; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 4467; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 4468; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 4469; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] 4470; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 4471; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] 4472; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 4473; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 4474; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 4475; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] 4476; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 4477; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] 4478; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 4479; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] 4480; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4481; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm14 4482; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 4483; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm15 4484; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 4485; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} 4486; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm14 4487; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %ymm23 4488; 
AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm24 4489; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %ymm25 4490; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] 4491; AVX512DQ-NEXT: vmovdqa64 (%rsi), %ymm26 4492; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %ymm27 4493; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm28 4494; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm30 4495; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] 4496; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] 4497; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 4498; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] 4499; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4500; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 4501; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 4502; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} 4503; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] 4504; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] 4505; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] 4506; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 4507; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 4508; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 4509; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 4510; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} 4511; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] 4512; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] 4513; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] 4514; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 4515; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 4516; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 4517; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} 4518; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] 4519; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] 4520; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 4521; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 4522; AVX512DQ-NEXT: vmovdqa64 %zmm0, 640(%rax) 4523; AVX512DQ-NEXT: vmovdqa64 %zmm7, 704(%rax) 4524; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax) 4525; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%rax) 4526; AVX512DQ-NEXT: vmovdqa64 %zmm11, 896(%rax) 4527; AVX512DQ-NEXT: vmovdqa64 %zmm5, 960(%rax) 4528; AVX512DQ-NEXT: vmovdqa64 %zmm2, 768(%rax) 4529; AVX512DQ-NEXT: vmovdqa64 %zmm20, 832(%rax) 4530; AVX512DQ-NEXT: vmovdqa64 %zmm19, 512(%rax) 4531; AVX512DQ-NEXT: vmovdqa64 %zmm18, 576(%rax) 4532; AVX512DQ-NEXT: vmovdqa64 %zmm17, 384(%rax) 4533; AVX512DQ-NEXT: vmovdqa64 %zmm16, 448(%rax) 4534; AVX512DQ-NEXT: vmovdqa64 %zmm31, 256(%rax) 4535; AVX512DQ-NEXT: vmovdqa64 %zmm29, 320(%rax) 4536; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%rax) 4537; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%rax) 4538; AVX512DQ-NEXT: vzeroupper 4539; AVX512DQ-NEXT: retq 4540; 4541; AVX512DQ-FCP-LABEL: store_i64_stride8_vf16: 4542; AVX512DQ-FCP: # %bb.0: 4543; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4544; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 4545; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 4546; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 4547; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm15 4548; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm27 4549; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm17 4550; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 4551; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 4552; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 4553; 
AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm26 4554; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm6 4555; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 4556; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm7 4557; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm1 4558; AVX512DQ-FCP-NEXT: vmovdqa64 (%r11), %zmm8 4559; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r11), %zmm3 4560; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm9 4561; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm4 4562; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] 4563; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4564; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 4565; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 4566; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 4567; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 4568; AVX512DQ-FCP-NEXT: movb $-64, %r8b 4569; AVX512DQ-FCP-NEXT: kmovw %r8d, %k1 4570; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} 4571; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm10 4572; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 4573; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 4574; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 4575; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] 4576; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 4577; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] 4578; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4579; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 4580; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 4581; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 4582; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 4583; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} 4584; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] 4585; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 4586; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] 4587; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4588; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 4589; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 4590; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 4591; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 4592; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] 4593; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 4594; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 4595; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] 4596; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} 4597; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 4598; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] 4599; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4600; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 4601; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 4602; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 4603; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 4604; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] 4605; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 4606; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 4607; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] 4608; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} 4609; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 4610; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] 4611; 
AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4612; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 4613; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 4614; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 4615; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 4616; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] 4617; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 4618; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 4619; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] 4620; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 4621; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] 4622; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4623; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 4624; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 4625; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] 4626; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 4627; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 4628; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] 4629; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 4630; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 4631; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 4632; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 4633; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} 4634; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 4635; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 4636; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 4637; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 4638; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] 4639; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 4640; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 4641; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 4642; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 4643; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} 4644; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] 4645; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 4646; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 4647; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 4648; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 4649; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 4650; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 4651; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 4652; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] 4653; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} 4654; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 4655; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 4656; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 4657; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 4658; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 4659; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 4660; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 4661; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 4662; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} 4663; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 4664; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 4665; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 4666; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 4667; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 4668; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] 4669; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 4670; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] 4671; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 4672; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 4673; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 4674; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] 4675; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 4676; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] 4677; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 4678; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] 4679; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4680; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 4681; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 4682; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 4683; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 4684; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} 4685; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm14 4686; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %ymm23 4687; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %ymm24 4688; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %ymm25 4689; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] 4690; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %ymm26 4691; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %ymm27 4692; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 4693; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30 4694; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] 4695; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] 4696; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 4697; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] 4698; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4699; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 4700; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 4701; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} 4702; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] 4703; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] 4704; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] 4705; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 4706; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 4707; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 4708; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 4709; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} 4710; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] 4711; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] 4712; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] 4713; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 4714; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 4715; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 4716; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} 4717; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] 4718; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] 4719; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 4720; AVX512DQ-FCP-NEXT: 
vinserti64x4 $0, %ymm1, %zmm0, %zmm0 4721; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 640(%rax) 4722; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) 4723; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) 4724; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) 4725; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 896(%rax) 4726; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 960(%rax) 4727; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax) 4728; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 832(%rax) 4729; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) 4730; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax) 4731; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax) 4732; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) 4733; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 256(%rax) 4734; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 320(%rax) 4735; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%rax) 4736; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax) 4737; AVX512DQ-FCP-NEXT: vzeroupper 4738; AVX512DQ-FCP-NEXT: retq 4739; 4740; AVX512BW-LABEL: store_i64_stride8_vf16: 4741; AVX512BW: # %bb.0: 4742; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 4743; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 4744; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 4745; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 4746; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 4747; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm27 4748; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm17 4749; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm14 4750; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 4751; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm28 4752; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm26 4753; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm6 4754; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 4755; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm7 4756; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm1 4757; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm8 4758; AVX512BW-NEXT: vmovdqa64 64(%r11), %zmm3 4759; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm9 4760; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm4 4761; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] 4762; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4763; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 4764; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 4765; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 4766; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 4767; AVX512BW-NEXT: movb $-64, %r8b 4768; AVX512BW-NEXT: kmovd %r8d, %k1 4769; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} 4770; AVX512BW-NEXT: vmovdqa (%rsi), %xmm10 4771; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 4772; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm16 4773; AVX512BW-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 4774; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] 4775; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 4776; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] 4777; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4778; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 4779; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 4780; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 4781; AVX512BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 4782; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} 4783; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] 4784; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 4785; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] 4786; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4787; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 4788; AVX512BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 4789; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 4790; 
AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 4791; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] 4792; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 4793; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 4794; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] 4795; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} 4796; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 4797; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] 4798; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4799; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 4800; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 4801; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 4802; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 4803; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] 4804; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 4805; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 4806; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] 4807; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} 4808; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 4809; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] 4810; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4811; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 4812; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 4813; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 4814; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 4815; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] 4816; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 4817; AVX512BW-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 4818; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] 4819; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 4820; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] 4821; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4822; AVX512BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 4823; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 4824; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] 4825; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 4826; AVX512BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 4827; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] 4828; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 4829; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 4830; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 4831; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 4832; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} 4833; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm2 4834; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 4835; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5 4836; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 4837; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] 4838; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 4839; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 4840; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 4841; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 4842; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} 4843; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] 4844; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 4845; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 4846; AVX512BW-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 4847; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 4848; AVX512BW-NEXT: vpermt2q %zmm27, %zmm20, 
%zmm5 4849; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 4850; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 4851; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] 4852; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} 4853; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 4854; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 4855; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 4856; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 4857; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 4858; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 4859; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 4860; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 4861; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} 4862; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 4863; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm5 4864; AVX512BW-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 4865; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 4866; AVX512BW-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 4867; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] 4868; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 4869; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] 4870; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 4871; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 4872; AVX512BW-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 4873; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] 4874; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 4875; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] 4876; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 4877; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] 4878; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4879; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 4880; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 4881; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 4882; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 4883; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} 4884; AVX512BW-NEXT: vmovdqa (%rcx), %ymm14 4885; AVX512BW-NEXT: vmovdqa64 64(%rcx), %ymm23 4886; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm24 4887; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm25 4888; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] 4889; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm26 4890; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm27 4891; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm28 4892; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm30 4893; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] 4894; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] 4895; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 4896; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] 4897; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4898; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 4899; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 4900; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} 4901; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] 4902; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] 4903; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] 4904; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 4905; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 4906; AVX512BW-NEXT: vpermt2q %zmm4, 
%zmm13, %zmm7 4907; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 4908; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} 4909; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] 4910; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] 4911; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] 4912; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 4913; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 4914; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 4915; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} 4916; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] 4917; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] 4918; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 4919; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 4920; AVX512BW-NEXT: vmovdqa64 %zmm0, 640(%rax) 4921; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax) 4922; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) 4923; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) 4924; AVX512BW-NEXT: vmovdqa64 %zmm11, 896(%rax) 4925; AVX512BW-NEXT: vmovdqa64 %zmm5, 960(%rax) 4926; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rax) 4927; AVX512BW-NEXT: vmovdqa64 %zmm20, 832(%rax) 4928; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) 4929; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax) 4930; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%rax) 4931; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rax) 4932; AVX512BW-NEXT: vmovdqa64 %zmm31, 256(%rax) 4933; AVX512BW-NEXT: vmovdqa64 %zmm29, 320(%rax) 4934; AVX512BW-NEXT: vmovdqa64 %zmm22, (%rax) 4935; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rax) 4936; AVX512BW-NEXT: vzeroupper 4937; AVX512BW-NEXT: retq 4938; 4939; AVX512BW-FCP-LABEL: store_i64_stride8_vf16: 4940; AVX512BW-FCP: # %bb.0: 4941; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4942; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 4943; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 4944; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 4945; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm15 4946; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm27 4947; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm17 4948; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 4949; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 4950; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 4951; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm26 4952; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm6 4953; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 4954; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm7 4955; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm1 4956; AVX512BW-FCP-NEXT: vmovdqa64 (%r11), %zmm8 4957; AVX512BW-FCP-NEXT: vmovdqa64 64(%r11), %zmm3 4958; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm9 4959; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm4 4960; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] 4961; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4962; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 4963; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 4964; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 4965; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 4966; AVX512BW-FCP-NEXT: movb $-64, %r8b 4967; AVX512BW-FCP-NEXT: kmovd %r8d, %k1 4968; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} 4969; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm10 4970; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 4971; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 4972; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 4973; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] 
4974; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 4975; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] 4976; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4977; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 4978; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 4979; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 4980; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 4981; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} 4982; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] 4983; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 4984; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] 4985; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4986; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 4987; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 4988; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 4989; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 4990; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] 4991; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 4992; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 4993; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] 4994; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} 4995; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 4996; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] 4997; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4998; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 4999; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 5000; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 5001; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 5002; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] 5003; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 5004; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 5005; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] 5006; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} 5007; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 5008; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] 5009; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 5010; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 5011; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 5012; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 5013; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 5014; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] 5015; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 5016; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 5017; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] 5018; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 5019; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] 5020; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 5021; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 5022; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 5023; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] 5024; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 5025; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 5026; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] 5027; AVX512BW-FCP-NEXT: vinserti64x4 
$0, %ymm2, %zmm5, %zmm17 5028; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 5029; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 5030; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 5031; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} 5032; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 5033; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 5034; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 5035; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 5036; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] 5037; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 5038; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 5039; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 5040; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 5041; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} 5042; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] 5043; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 5044; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 5045; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 5046; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 5047; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 5048; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 5049; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 5050; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] 5051; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} 5052; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 5053; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 5054; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 5055; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 5056; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 5057; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 5058; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 5059; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 5060; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} 5061; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 5062; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 5063; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 5064; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 5065; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 5066; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] 5067; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 5068; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] 5069; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 5070; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 5071; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 5072; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] 5073; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 5074; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] 5075; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 5076; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] 5077; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 5078; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 5079; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 5080; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 5081; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 5082; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} 5083; AVX512BW-FCP-NEXT: 
vmovdqa (%rcx), %ymm14 5084; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %ymm23 5085; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm24 5086; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %ymm25 5087; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] 5088; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm26 5089; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm27 5090; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 5091; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30 5092; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] 5093; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] 5094; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 5095; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] 5096; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 5097; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 5098; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 5099; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} 5100; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] 5101; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] 5102; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] 5103; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 5104; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 5105; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 5106; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 5107; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} 5108; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] 5109; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] 5110; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] 5111; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 5112; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 5113; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 5114; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} 5115; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] 5116; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] 5117; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 5118; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 5119; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 640(%rax) 5120; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) 5121; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) 5122; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) 5123; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 896(%rax) 5124; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 960(%rax) 5125; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax) 5126; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 832(%rax) 5127; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) 5128; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax) 5129; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax) 5130; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) 5131; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 256(%rax) 5132; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 320(%rax) 5133; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, (%rax) 5134; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax) 5135; AVX512BW-FCP-NEXT: vzeroupper 5136; AVX512BW-FCP-NEXT: retq 5137; 5138; AVX512DQ-BW-LABEL: store_i64_stride8_vf16: 5139; AVX512DQ-BW: # %bb.0: 5140; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 5141; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 5142; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 5143; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm11 5144; AVX512DQ-BW-NEXT: 
vmovdqa64 (%rdi), %zmm15
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm27
; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm17
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm14
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm28
; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm26
; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 (%r11), %zmm8
; AVX512DQ-BW-NEXT: vmovdqa64 64(%r11), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm9
; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm4
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9]
; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm12
; AVX512DQ-BW-NEXT: movb $-64, %r8b
; AVX512DQ-BW-NEXT: kmovd %r8d, %k1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm10
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm16
; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8]
; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm12
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm20
; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm20
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1}
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13]
; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10
; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm12
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm12
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12]
; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm12
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm2
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15]
; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm13
; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm13
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13
; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm24, %zmm13
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14]
; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm15
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm5
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm18
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %xmm2
; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2
; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13
; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm13
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm19
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1}
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm20, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm20, %zmm5
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm20
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm5
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm23
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm24, %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13
; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm24, %zmm13
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm24
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm14
; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm25, %zmm11
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm25
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11]
; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm14
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm14
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15
; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm15
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm14
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %ymm23
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm24
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %ymm25
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3]
; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm26
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %ymm27
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm28
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm30
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1}
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2]
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7
; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm7
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7
; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2]
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 640(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 704(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 896(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 960(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 768(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 832(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 512(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 576(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 384(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 448(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 256(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 320(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, (%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i64_stride8_vf16:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm27
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm17
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm26
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r11), %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r11), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm4
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9]
; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm12
; AVX512DQ-BW-FCP-NEXT: movb $-64, %r8b
; AVX512DQ-BW-FCP-NEXT: kmovd %r8d, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm10
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm16
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8]
; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm20
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13]
; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12]
; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15]
; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14]
; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm2
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm25
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11]
; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm14
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %ymm23
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm24
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %ymm25
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm26
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm27
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3]
; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2]
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2]
; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3]
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3]
; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2]
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2]
; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 640(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 896(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 960(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 832(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 256(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 320(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64
  %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64
  %in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 64
  %in.vec3 = load <16 x i64>, ptr %in.vecptr3, align 64
  %in.vec4 = load <16 x i64>, ptr %in.vecptr4, align 64
  %in.vec5 = load <16 x i64>, ptr %in.vecptr5, align 64
  %in.vec6 = load <16 x i64>, ptr %in.vecptr6, align 64
  %in.vec7 = load <16 x i64>, ptr %in.vecptr7, align 64
  %1 = shufflevector <16 x i64> %in.vec0, <16 x i64> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %2 = shufflevector <16 x i64> %in.vec2, <16 x i64> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %3 = shufflevector <16 x i64> %in.vec4, <16 x i64> %in.vec5, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %4 = shufflevector <16 x i64> %in.vec6, <16 x i64> %in.vec7, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %5 = shufflevector <32 x i64> %1, <32 x i64> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %6 = shufflevector <32 x i64> %3, <32 x i64> %4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %7 = shufflevector <64 x i64> %5, <64 x i64> %6, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %interleaved.vec = shufflevector <128 x i64> %7, <128 x i64> poison, <128 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 80, i32 96, i32 112, i32 1, i32 17, i32 33, i32 49, i32 65, i32 81, i32 97, i32 113, i32 2, i32 18, i32 34, i32 50, i32 66, i32 82, i32 98, i32 114, i32 3, i32 19, i32 35, i32 51, i32 67, i32 83, i32 99, i32 115, i32 4, i32 20, i32 36, i32 52, i32 68, i32 84, i32 100, i32 116, i32 5, i32 21, i32 37, i32 53, i32 69, i32 85, i32 101, i32 117, i32 6, i32 22, i32 38, i32 54, i32 70, i32 86, i32 102, i32 118, i32 7, i32 23, i32 39, i32 55, i32 71, i32 87, i32 103, i32 119, i32 8, i32 24, i32 40, i32 56, i32 72, i32 88, i32 104, i32 120, i32 9, i32 25, i32 41, i32 57, i32 73, i32 89, i32 105, i32 121, i32 10, i32 26, i32 42, i32 58, i32 74, i32 90, i32 106, i32 122, i32 11, i32 27, i32 43, i32 59, i32 75, i32 91, i32 107, i32 123, i32 12, i32 28, i32 44, i32 60, i32 76, i32 92, i32 108, i32 124, i32 13, i32 29, i32 45, i32 61, i32 77, i32 93, i32 109, i32 125, i32 14, i32 30, i32 46, i32 62, i32 78, i32 94, i32 110, i32 126, i32 15, i32 31, i32 47, i32 63, i32 79, i32 95, i32 111, i32 127>
  store <128 x i64> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind {
; SSE-LABEL: store_i64_stride8_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $1688, %rsp # imm = 0x698
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movaps (%rdi), %xmm7
; SSE-NEXT: movaps 16(%rdi), %xmm8
; SSE-NEXT: movaps (%rsi), %xmm1
; SSE-NEXT: movaps 16(%rsi), %xmm0
; SSE-NEXT: movaps (%rdx), %xmm9
; SSE-NEXT: movaps 16(%rdx), %xmm10
; SSE-NEXT: movaps (%rcx), %xmm3
; SSE-NEXT: movaps 16(%rcx), %xmm2
; SSE-NEXT: movaps (%r8), %xmm11
; SSE-NEXT: movaps 16(%r8), %xmm12
; SSE-NEXT: movaps (%r9), %xmm5
; SSE-NEXT: movaps 16(%r9), %xmm4
; SSE-NEXT: movaps (%r10), %xmm13
; SSE-NEXT: movaps 16(%r10), %xmm14
; SSE-NEXT: movaps (%rax), %xmm6
; SSE-NEXT: movaps %xmm7, %xmm15
; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0]
; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm9, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1]
; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm11, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1]
; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm13, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1]
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm8, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm10, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1]
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm12, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1]
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 16(%rax), %xmm0
; SSE-NEXT: movaps %xmm14, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 32(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 32(%rdx), %xmm2
; SSE-NEXT: movaps 32(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 32(%r8), %xmm2
; SSE-NEXT: movaps 32(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 32(%r10), %xmm2
; SSE-NEXT: movaps 32(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 48(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 48(%rdx), %xmm2
; SSE-NEXT: movaps 48(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 48(%r8), %xmm2
; SSE-NEXT: movaps 48(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 48(%r10), %xmm2
; SSE-NEXT: movaps 48(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%rdi), %xmm2
; SSE-NEXT: movaps 64(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%rdx), %xmm2
; SSE-NEXT: movaps 64(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%r8), %xmm2
; SSE-NEXT: movaps 64(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%r10), %xmm2
; SSE-NEXT: movaps 64(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 80(%rdi), %xmm2
; SSE-NEXT: movaps 80(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 80(%rdx), %xmm2
; SSE-NEXT: movaps 80(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 80(%r8), %xmm2
; SSE-NEXT: movaps 80(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 80(%r10), %xmm2
; SSE-NEXT: movaps 80(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 96(%rdi), %xmm2
; SSE-NEXT: movaps 96(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 96(%rdx), %xmm2
; SSE-NEXT: movaps 96(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 96(%r8), %xmm2
; SSE-NEXT: movaps 96(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 96(%r10), %xmm2
; SSE-NEXT: movaps 96(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 112(%rdi), %xmm2
; SSE-NEXT: movaps 112(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 112(%rdx), %xmm2
; SSE-NEXT: movaps 112(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 112(%r8), %xmm2
; SSE-NEXT: movaps 112(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 112(%r10), %xmm2
; SSE-NEXT: movaps 112(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 128(%rdi), %xmm2
; SSE-NEXT: movaps 128(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 128(%rdx), %xmm2
; SSE-NEXT: movaps 128(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 128(%r8), %xmm2
; SSE-NEXT: movaps 128(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 128(%r10), %xmm2
; SSE-NEXT: movaps 128(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 144(%rdi), %xmm2
; SSE-NEXT: movaps 144(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 144(%rdx), %xmm2
; SSE-NEXT: movaps 144(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 144(%r8), %xmm2
; SSE-NEXT: movaps 144(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 144(%r10), %xmm2
; SSE-NEXT: movaps 144(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 160(%rdi), %xmm2
; SSE-NEXT: movaps 160(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 160(%rdx), %xmm2
; SSE-NEXT: movaps 160(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 160(%r8), %xmm2
; SSE-NEXT: movaps 160(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 160(%r10), %xmm2
; SSE-NEXT: movaps 160(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 176(%rdi), %xmm2
; SSE-NEXT: movaps 176(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 176(%rdx), %xmm2
; SSE-NEXT: movaps 176(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 176(%r8), %xmm2
; SSE-NEXT: movaps 176(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 176(%r10), %xmm2
; SSE-NEXT: movaps 176(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 192(%rdi), %xmm2
; SSE-NEXT: movaps 192(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 192(%rdx), %xmm2
; SSE-NEXT: movaps 192(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 192(%r8), %xmm2
; SSE-NEXT: movaps 192(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 192(%r10), %xmm2
; SSE-NEXT: movaps 192(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 208(%rdi), %xmm2
; SSE-NEXT: movaps 208(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 208(%rdx), %xmm2
; SSE-NEXT: movaps 208(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 208(%r8), %xmm2
; SSE-NEXT: movaps 208(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 208(%r10), %xmm2
; SSE-NEXT: movaps 208(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 224(%rdi), %xmm14
; SSE-NEXT: movaps 224(%rsi), %xmm0
; SSE-NEXT: movaps %xmm14, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
; SSE-NEXT: movaps 224(%rdx), %xmm10
; SSE-NEXT: movaps 224(%rcx), %xmm0
; SSE-NEXT: movaps %xmm10, %xmm15
; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
; SSE-NEXT: movaps 224(%r8), %xmm12
; SSE-NEXT: movaps 224(%r9), %xmm0
; SSE-NEXT: movaps %xmm12, %xmm13
; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
; SSE-NEXT: movaps 224(%r10), %xmm8
; SSE-NEXT: movaps 224(%rax), %xmm0
; SSE-NEXT: movaps %xmm8, %xmm11
; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
; SSE-NEXT: movaps 240(%rdi), %xmm5
; SSE-NEXT: movaps 240(%rsi), %xmm0
; SSE-NEXT: movaps %xmm5, %xmm9
; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE-NEXT: movaps 240(%rdx), %xmm6
; SSE-NEXT: movaps 240(%rcx), %xmm1
; SSE-NEXT: movaps %xmm6, %xmm7
; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
; SSE-NEXT: movaps 240(%r8), %xmm1
; SSE-NEXT: movaps 240(%r9), %xmm2
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-NEXT: movaps 240(%r10), %xmm2
; SSE-NEXT: movaps 240(%rax), %xmm3
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps %xmm2, 2032(%rax)
; SSE-NEXT: movaps %xmm1, 2016(%rax)
; SSE-NEXT: movaps %xmm6, 2000(%rax)
; SSE-NEXT: movaps %xmm5, 1984(%rax)
; SSE-NEXT: movaps %xmm0, 1968(%rax)
; SSE-NEXT: movaps %xmm4, 1952(%rax)
; SSE-NEXT: movaps %xmm7, 1936(%rax)
; SSE-NEXT: movaps %xmm9, 1920(%rax)
; SSE-NEXT: movaps %xmm8, 1904(%rax)
; SSE-NEXT: movaps %xmm12, 1888(%rax)
; SSE-NEXT: movaps %xmm10, 1872(%rax)
; SSE-NEXT: movaps %xmm14, 1856(%rax)
; SSE-NEXT: movaps %xmm11, 1840(%rax)
; SSE-NEXT: movaps %xmm13, 1824(%rax)
; SSE-NEXT: movaps %xmm15, 1808(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1792(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1776(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1760(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1744(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1728(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1712(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1696(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1680(%rax)
; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1664(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1648(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1632(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1616(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1600(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1584(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1568(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1552(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1536(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1520(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1504(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1488(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1472(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1456(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1440(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1424(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1408(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1392(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1376(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1360(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1344(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1328(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1312(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1296(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1280(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1264(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1248(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1232(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1216(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1200(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1184(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1168(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1152(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1136(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1120(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1104(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1088(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1072(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1056(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1040(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1024(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1008(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 992(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 976(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 960(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 944(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 928(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 912(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 896(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 880(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 864(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 848(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 832(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 816(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 800(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 784(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 768(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 752(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 736(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 720(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 704(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 688(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 672(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 656(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 640(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 624(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 608(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 592(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 576(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 560(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 544(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 528(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 512(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 496(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 480(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 464(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 448(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 432(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 416(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 400(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 384(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 368(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 352(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 336(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 320(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 304(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 288(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 272(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 256(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 240(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 224(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 208(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 192(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 176(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 160(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 144(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 128(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 112(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 96(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 64(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 48(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, (%rax)
; SSE-NEXT: addq $1688, %rsp # imm = 0x698
; SSE-NEXT: retq
;
; AVX-LABEL: store_i64_stride8_vf32:
; AVX: # %bb.0:
; AVX-NEXT: subq $1672, %rsp # imm = 0x688
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps (%rsi), %xmm2
; AVX-NEXT: vmovaps (%rdi), %xmm3
; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0]
; AVX-NEXT: vmovaps (%rcx), %xmm5
; AVX-NEXT: vmovaps 32(%rcx), %xmm1
; AVX-NEXT: vmovaps 64(%rcx), %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm6
; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4
; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[2]
; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps (%r9), %xmm4
; AVX-NEXT: vmovaps (%r8), %xmm6
; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm4[0]
; AVX-NEXT: vmovaps (%rax), %xmm8
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm9
; AVX-NEXT: vinsertf128 $1, (%r10), %ymm7, %ymm7
; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[2]
; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm3
; AVX-NEXT: vbroadcastsd 8(%rdx), %ymm5
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm4[1]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm3
; AVX-NEXT: vbroadcastsd 8(%r10), %ymm4
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 32(%rsi), %xmm3
; AVX-NEXT: vmovaps 32(%rdi), %xmm4
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm3[0]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm5
; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[2]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 32(%r9), %xmm5
; AVX-NEXT: vmovaps 32(%r8), %xmm6
; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0]
; AVX-NEXT: vmovaps 32(%rax), %xmm8
; AVX-NEXT: vmovaps 64(%rax), %xmm2
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm9
; AVX-NEXT: vinsertf128 $1, 32(%r10), %ymm7, %ymm7
; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[2]
; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX-NEXT: vbroadcastsd 40(%rdx), %ymm4
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm3
; AVX-NEXT: vbroadcastsd 40(%r10), %ymm4
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 64(%rsi), %xmm1
; AVX-NEXT: vmovaps 64(%rdi), %xmm3
; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm1[0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm5
; AVX-NEXT: vinsertf128 $1, 64(%rdx), %ymm4, %ymm4
; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[2]
; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 64(%r9), %xmm4
; AVX-NEXT: vmovaps 64(%r8), %xmm5
; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7
; AVX-NEXT: vinsertf128 $1, 64(%r10), %ymm6, %ymm6
; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: vbroadcastsd 72(%rdx), %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vmovaps 96(%rcx), %xmm3
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 72(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 96(%rsi), %xmm0
; AVX-NEXT: vmovaps 96(%rdi), %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
; AVX-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm2
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 96(%r9), %xmm2
; AVX-NEXT: vmovaps 96(%r8), %xmm4
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0]
; AVX-NEXT: vmovaps 96(%rax), %xmm6
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7
; AVX-NEXT: vinsertf128 $1, 96(%r10), %ymm5, %ymm5
; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 104(%rdx), %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 104(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 128(%rsi), %xmm0
; AVX-NEXT: vmovaps 128(%rdi), %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovaps 128(%rcx), %xmm3
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
; AVX-NEXT: vinsertf128 $1, 128(%rdx), %ymm2, %ymm2
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 128(%r9), %xmm2
; AVX-NEXT: vmovaps 128(%r8), %xmm4
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0]
; AVX-NEXT: vmovaps 128(%rax), %xmm6
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7
; AVX-NEXT: vinsertf128 $1, 128(%r10), %ymm5, %ymm5
; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 136(%rdx), %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 136(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 160(%rsi), %xmm0
; AVX-NEXT: vmovaps 160(%rdi), %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovaps
160(%rcx), %xmm3 6388; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 6389; AVX-NEXT: vinsertf128 $1, 160(%rdx), %ymm2, %ymm2 6390; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] 6391; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6392; AVX-NEXT: vmovaps 160(%rax), %xmm2 6393; AVX-NEXT: vmovaps 160(%r9), %xmm4 6394; AVX-NEXT: vmovaps 160(%r8), %xmm5 6395; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] 6396; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7 6397; AVX-NEXT: vinsertf128 $1, 160(%r10), %ymm6, %ymm6 6398; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] 6399; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6400; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 6401; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 6402; AVX-NEXT: vbroadcastsd 168(%rdx), %ymm3 6403; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 6404; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 6405; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6406; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] 6407; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 6408; AVX-NEXT: vbroadcastsd 168(%r10), %ymm2 6409; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 6410; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 6411; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6412; AVX-NEXT: vmovaps 192(%rsi), %xmm0 6413; AVX-NEXT: vmovaps 192(%rdi), %xmm1 6414; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 6415; AVX-NEXT: vmovaps 192(%rcx), %xmm3 6416; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 6417; AVX-NEXT: vinsertf128 $1, 192(%rdx), %ymm2, %ymm2 6418; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] 6419; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6420; AVX-NEXT: vmovaps 192(%rax), %xmm2 6421; AVX-NEXT: vmovaps 192(%r9), %xmm4 6422; AVX-NEXT: vmovaps 192(%r8), %xmm5 6423; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] 6424; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7 6425; AVX-NEXT: vinsertf128 $1, 192(%r10), %ymm6, %ymm6 6426; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] 6427; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6428; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 6429; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 6430; AVX-NEXT: vbroadcastsd 200(%rdx), %ymm3 6431; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 6432; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 6433; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6434; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] 6435; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 6436; AVX-NEXT: vbroadcastsd 200(%r10), %ymm2 6437; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 6438; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 6439; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6440; AVX-NEXT: vmovaps 224(%rsi), %xmm0 6441; AVX-NEXT: vmovaps 224(%rdi), %xmm1 6442; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 6443; AVX-NEXT: vmovaps 224(%rcx), %xmm3 6444; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 6445; AVX-NEXT: vinsertf128 $1, 224(%rdx), %ymm2, %ymm2 6446; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] 6447; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6448; AVX-NEXT: vmovaps 224(%rax), %xmm2 6449; 
AVX-NEXT: vmovaps 224(%r9), %xmm4 6450; AVX-NEXT: vmovaps 224(%r8), %xmm5 6451; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] 6452; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7 6453; AVX-NEXT: vinsertf128 $1, 224(%r10), %ymm6, %ymm6 6454; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] 6455; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6456; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 6457; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 6458; AVX-NEXT: vbroadcastsd 232(%rdx), %ymm3 6459; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 6460; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 6461; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6462; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] 6463; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 6464; AVX-NEXT: vbroadcastsd 232(%r10), %ymm2 6465; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 6466; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 6467; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6468; AVX-NEXT: vmovaps 16(%rsi), %xmm0 6469; AVX-NEXT: vmovaps 16(%rdi), %xmm1 6470; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 6471; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 6472; AVX-NEXT: vbroadcastsd 16(%rcx), %ymm3 6473; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 6474; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6475; AVX-NEXT: vmovaps 16(%r9), %xmm2 6476; AVX-NEXT: vmovaps 16(%r8), %xmm3 6477; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 6478; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 6479; AVX-NEXT: vbroadcastsd 16(%rax), %ymm5 6480; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 6481; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6482; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 6483; AVX-NEXT: vbroadcastsd 24(%rdx), %ymm1 6484; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6485; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 6486; AVX-NEXT: vbroadcastsd 24(%r10), %ymm1 6487; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6488; AVX-NEXT: vmovaps 48(%rsi), %xmm0 6489; AVX-NEXT: vmovaps 48(%rdi), %xmm1 6490; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 6491; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 6492; AVX-NEXT: vbroadcastsd 48(%rcx), %ymm3 6493; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 6494; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6495; AVX-NEXT: vmovaps 48(%r9), %xmm2 6496; AVX-NEXT: vmovaps 48(%r8), %xmm3 6497; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 6498; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 6499; AVX-NEXT: vbroadcastsd 48(%rax), %ymm5 6500; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 6501; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6502; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 6503; AVX-NEXT: vbroadcastsd 56(%rdx), %ymm1 6504; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6505; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6506; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 6507; AVX-NEXT: vbroadcastsd 56(%r10), %ymm1 6508; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6509; AVX-NEXT: vmovaps 80(%rsi), %xmm0 6510; AVX-NEXT: vmovaps 80(%rdi), %xmm1 6511; 
AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 6512; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 6513; AVX-NEXT: vbroadcastsd 80(%rcx), %ymm3 6514; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 6515; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6516; AVX-NEXT: vmovaps 80(%r9), %xmm2 6517; AVX-NEXT: vmovaps 80(%r8), %xmm3 6518; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 6519; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 6520; AVX-NEXT: vbroadcastsd 80(%rax), %ymm5 6521; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 6522; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6523; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 6524; AVX-NEXT: vbroadcastsd 88(%rdx), %ymm1 6525; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6526; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6527; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 6528; AVX-NEXT: vbroadcastsd 88(%r10), %ymm1 6529; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6530; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6531; AVX-NEXT: vmovaps 112(%rsi), %xmm0 6532; AVX-NEXT: vmovaps 112(%rdi), %xmm1 6533; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 6534; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 6535; AVX-NEXT: vbroadcastsd 112(%rcx), %ymm3 6536; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 6537; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6538; AVX-NEXT: vmovaps 112(%r9), %xmm2 6539; AVX-NEXT: vmovaps 112(%r8), %xmm3 6540; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 6541; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 6542; AVX-NEXT: vbroadcastsd 112(%rax), %ymm5 6543; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 6544; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6545; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 6546; AVX-NEXT: vbroadcastsd 120(%rdx), %ymm1 6547; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6548; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 6549; AVX-NEXT: vbroadcastsd 120(%r10), %ymm1 6550; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6551; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6552; AVX-NEXT: vmovaps 144(%rsi), %xmm0 6553; AVX-NEXT: vmovaps 144(%rdi), %xmm1 6554; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 6555; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 6556; AVX-NEXT: vbroadcastsd 144(%rcx), %ymm3 6557; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 6558; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6559; AVX-NEXT: vmovaps 144(%r9), %xmm2 6560; AVX-NEXT: vmovaps 144(%r8), %xmm3 6561; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 6562; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 6563; AVX-NEXT: vbroadcastsd 144(%rax), %ymm5 6564; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 6565; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6566; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 6567; AVX-NEXT: vbroadcastsd 152(%rdx), %ymm1 6568; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6569; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6570; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 6571; AVX-NEXT: vbroadcastsd 152(%r10), %ymm1 6572; AVX-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6573; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6574; AVX-NEXT: vmovaps 176(%rsi), %xmm0 6575; AVX-NEXT: vmovaps 176(%rdi), %xmm1 6576; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 6577; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 6578; AVX-NEXT: vbroadcastsd 176(%rcx), %ymm3 6579; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 6580; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6581; AVX-NEXT: vmovaps 176(%r9), %xmm2 6582; AVX-NEXT: vmovaps 176(%r8), %xmm3 6583; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 6584; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 6585; AVX-NEXT: vbroadcastsd 176(%rax), %ymm5 6586; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 6587; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6588; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 6589; AVX-NEXT: vbroadcastsd 184(%rdx), %ymm1 6590; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6591; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6592; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 6593; AVX-NEXT: vbroadcastsd 184(%r10), %ymm1 6594; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6595; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6596; AVX-NEXT: vmovaps 208(%rsi), %xmm0 6597; AVX-NEXT: vmovaps 208(%rdi), %xmm1 6598; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 6599; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 6600; AVX-NEXT: vbroadcastsd 208(%rcx), %ymm3 6601; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 6602; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6603; AVX-NEXT: vmovaps 208(%r9), %xmm2 6604; AVX-NEXT: vmovaps 208(%r8), %xmm3 6605; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 6606; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 6607; AVX-NEXT: vbroadcastsd 208(%rax), %ymm5 6608; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 6609; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6610; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 6611; AVX-NEXT: vbroadcastsd 216(%rdx), %ymm1 6612; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6613; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 6614; AVX-NEXT: vbroadcastsd 216(%r10), %ymm1 6615; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6616; AVX-NEXT: vmovaps 240(%rsi), %xmm0 6617; AVX-NEXT: vmovaps 240(%rdi), %xmm3 6618; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm0[0] 6619; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] 6620; AVX-NEXT: vbroadcastsd 240(%rcx), %ymm5 6621; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm5[6,7] 6622; AVX-NEXT: vmovaps 240(%r9), %xmm5 6623; AVX-NEXT: vmovaps 240(%r8), %xmm6 6624; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm5[0] 6625; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] 6626; AVX-NEXT: vbroadcastsd 240(%rax), %ymm14 6627; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm14[6,7] 6628; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] 6629; AVX-NEXT: vbroadcastsd 248(%rdx), %ymm3 6630; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7] 6631; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm5[1] 6632; AVX-NEXT: vbroadcastsd 248(%r10), %ymm3 6633; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] 6634; AVX-NEXT: movq 
{{[0-9]+}}(%rsp), %rdx 6635; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],mem[6,7] 6636; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6637; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],mem[6,7] 6638; AVX-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill 6639; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6640; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],mem[6,7] 6641; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5],mem[6,7] 6642; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6643; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],mem[6,7] 6644; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6645; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3,4,5],mem[6,7] 6646; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] 6647; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6648; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],mem[6,7] 6649; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6650; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],mem[6,7] 6651; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6652; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],mem[6,7] 6653; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6654; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],mem[6,7] 6655; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6656; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] 6657; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] 6658; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] 6659; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] 6660; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 6661; AVX-NEXT: vmovaps %ymm0, 2016(%rdx) 6662; AVX-NEXT: vmovaps %ymm1, 1984(%rdx) 6663; AVX-NEXT: vmovaps %ymm4, 1952(%rdx) 6664; AVX-NEXT: vmovaps %ymm7, 1920(%rdx) 6665; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6666; AVX-NEXT: vmovaps %ymm0, 1888(%rdx) 6667; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6668; AVX-NEXT: vmovaps %ymm0, 1856(%rdx) 6669; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6670; AVX-NEXT: vmovaps %ymm0, 1824(%rdx) 6671; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6672; AVX-NEXT: vmovaps %ymm0, 1792(%rdx) 6673; AVX-NEXT: vmovaps %ymm8, 1760(%rdx) 6674; AVX-NEXT: vmovaps %ymm10, 1728(%rdx) 6675; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6676; AVX-NEXT: vmovaps %ymm0, 1696(%rdx) 6677; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6678; AVX-NEXT: vmovaps %ymm0, 1664(%rdx) 6679; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6680; AVX-NEXT: vmovaps %ymm0, 1632(%rdx) 6681; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6682; AVX-NEXT: vmovaps %ymm0, 1600(%rdx) 6683; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6684; AVX-NEXT: vmovaps %ymm0, 1568(%rdx) 6685; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6686; AVX-NEXT: vmovaps %ymm0, 1536(%rdx) 6687; AVX-NEXT: vmovaps %ymm2, 1504(%rdx) 6688; AVX-NEXT: vmovaps %ymm3, 1472(%rdx) 6689; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6690; AVX-NEXT: vmovaps %ymm0, 1440(%rdx) 6691; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6692; AVX-NEXT: vmovaps %ymm0, 1408(%rdx) 6693; 
AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6694; AVX-NEXT: vmovaps %ymm0, 1376(%rdx) 6695; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6696; AVX-NEXT: vmovaps %ymm0, 1344(%rdx) 6697; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6698; AVX-NEXT: vmovaps %ymm0, 1312(%rdx) 6699; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6700; AVX-NEXT: vmovaps %ymm0, 1280(%rdx) 6701; AVX-NEXT: vmovaps %ymm5, 1248(%rdx) 6702; AVX-NEXT: vmovaps %ymm6, 1216(%rdx) 6703; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6704; AVX-NEXT: vmovaps %ymm0, 1184(%rdx) 6705; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6706; AVX-NEXT: vmovaps %ymm0, 1152(%rdx) 6707; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6708; AVX-NEXT: vmovaps %ymm0, 1120(%rdx) 6709; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6710; AVX-NEXT: vmovaps %ymm0, 1088(%rdx) 6711; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6712; AVX-NEXT: vmovaps %ymm0, 1056(%rdx) 6713; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6714; AVX-NEXT: vmovaps %ymm0, 1024(%rdx) 6715; AVX-NEXT: vmovaps %ymm9, 992(%rdx) 6716; AVX-NEXT: vmovaps %ymm11, 960(%rdx) 6717; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6718; AVX-NEXT: vmovaps %ymm0, 928(%rdx) 6719; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6720; AVX-NEXT: vmovaps %ymm0, 896(%rdx) 6721; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6722; AVX-NEXT: vmovaps %ymm0, 864(%rdx) 6723; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6724; AVX-NEXT: vmovaps %ymm0, 832(%rdx) 6725; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6726; AVX-NEXT: vmovaps %ymm0, 800(%rdx) 6727; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6728; AVX-NEXT: vmovaps %ymm0, 768(%rdx) 6729; AVX-NEXT: vmovaps %ymm12, 736(%rdx) 6730; AVX-NEXT: vmovaps %ymm13, 704(%rdx) 6731; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6732; AVX-NEXT: vmovaps %ymm0, 672(%rdx) 6733; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6734; AVX-NEXT: vmovaps %ymm0, 640(%rdx) 6735; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6736; AVX-NEXT: vmovaps %ymm0, 608(%rdx) 6737; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6738; AVX-NEXT: vmovaps %ymm0, 576(%rdx) 6739; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6740; AVX-NEXT: vmovaps %ymm0, 544(%rdx) 6741; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6742; AVX-NEXT: vmovaps %ymm0, 512(%rdx) 6743; AVX-NEXT: vmovaps %ymm14, 480(%rdx) 6744; AVX-NEXT: vmovaps %ymm15, 448(%rdx) 6745; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6746; AVX-NEXT: vmovaps %ymm0, 416(%rdx) 6747; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6748; AVX-NEXT: vmovaps %ymm0, 384(%rdx) 6749; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6750; AVX-NEXT: vmovaps %ymm0, 352(%rdx) 6751; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6752; AVX-NEXT: vmovaps %ymm0, 320(%rdx) 6753; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6754; AVX-NEXT: vmovaps %ymm0, 288(%rdx) 6755; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6756; AVX-NEXT: vmovaps %ymm0, 256(%rdx) 6757; AVX-NEXT: vmovups 
(%rsp), %ymm0 # 32-byte Reload 6758; AVX-NEXT: vmovaps %ymm0, 224(%rdx) 6759; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6760; AVX-NEXT: vmovaps %ymm0, 192(%rdx) 6761; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6762; AVX-NEXT: vmovaps %ymm0, 160(%rdx) 6763; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6764; AVX-NEXT: vmovaps %ymm0, 128(%rdx) 6765; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6766; AVX-NEXT: vmovaps %ymm0, 96(%rdx) 6767; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6768; AVX-NEXT: vmovaps %ymm0, 64(%rdx) 6769; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6770; AVX-NEXT: vmovaps %ymm0, 32(%rdx) 6771; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6772; AVX-NEXT: vmovaps %ymm0, (%rdx) 6773; AVX-NEXT: addq $1672, %rsp # imm = 0x688 6774; AVX-NEXT: vzeroupper 6775; AVX-NEXT: retq 6776; 6777; AVX2-LABEL: store_i64_stride8_vf32: 6778; AVX2: # %bb.0: 6779; AVX2-NEXT: subq $1704, %rsp # imm = 0x6A8 6780; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 6781; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 6782; AVX2-NEXT: vmovaps (%rcx), %xmm0 6783; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6784; AVX2-NEXT: vmovaps 32(%rcx), %xmm3 6785; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6786; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 6787; AVX2-NEXT: vmovaps (%rsi), %xmm2 6788; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6789; AVX2-NEXT: vmovaps 32(%rsi), %xmm4 6790; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6791; AVX2-NEXT: vmovaps (%rdi), %xmm1 6792; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6793; AVX2-NEXT: vmovaps 32(%rdi), %xmm5 6794; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6795; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 6796; AVX2-NEXT: vbroadcastsd 8(%rdx), %ymm2 6797; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 6798; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 6799; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6800; AVX2-NEXT: vmovaps (%rax), %xmm0 6801; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6802; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 6803; AVX2-NEXT: vmovaps (%r9), %xmm2 6804; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6805; AVX2-NEXT: vmovaps 32(%r9), %xmm6 6806; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6807; AVX2-NEXT: vmovaps (%r8), %xmm1 6808; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6809; AVX2-NEXT: vmovaps 32(%r8), %xmm7 6810; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6811; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 6812; AVX2-NEXT: vbroadcastsd 8(%r10), %ymm2 6813; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 6814; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 6815; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6816; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 6817; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] 6818; AVX2-NEXT: vbroadcastsd 40(%rdx), %ymm2 6819; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 6820; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 6821; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6822; AVX2-NEXT: vunpckhpd {{.*#+}} 
xmm0 = xmm7[1],xmm6[1] 6823; AVX2-NEXT: vbroadcastsd 40(%r10), %ymm1 6824; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6825; AVX2-NEXT: vmovaps 32(%rax), %xmm1 6826; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6827; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6828; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 6829; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6830; AVX2-NEXT: vmovaps 64(%rsi), %xmm1 6831; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6832; AVX2-NEXT: vmovaps 64(%rdi), %xmm0 6833; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6834; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 6835; AVX2-NEXT: vbroadcastsd 72(%rdx), %ymm1 6836; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6837; AVX2-NEXT: vmovaps 64(%rcx), %xmm1 6838; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6839; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6840; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 6841; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6842; AVX2-NEXT: vmovaps 64(%r9), %xmm1 6843; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6844; AVX2-NEXT: vmovaps 64(%r8), %xmm0 6845; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6846; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 6847; AVX2-NEXT: vbroadcastsd 72(%r10), %ymm1 6848; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6849; AVX2-NEXT: vmovaps 64(%rax), %xmm1 6850; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6851; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6852; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 6853; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6854; AVX2-NEXT: vmovaps 96(%rsi), %xmm1 6855; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6856; AVX2-NEXT: vmovaps 96(%rdi), %xmm0 6857; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6858; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 6859; AVX2-NEXT: vbroadcastsd 104(%rdx), %ymm1 6860; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6861; AVX2-NEXT: vmovaps 96(%rcx), %xmm1 6862; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6863; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6864; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 6865; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6866; AVX2-NEXT: vmovaps 96(%r9), %xmm1 6867; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6868; AVX2-NEXT: vmovaps 96(%r8), %xmm0 6869; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6870; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 6871; AVX2-NEXT: vbroadcastsd 104(%r10), %ymm1 6872; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6873; AVX2-NEXT: vmovaps 96(%rax), %xmm1 6874; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6875; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6876; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 6877; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6878; AVX2-NEXT: vmovaps 128(%rsi), %xmm1 6879; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6880; AVX2-NEXT: vmovaps 128(%rdi), %xmm0 6881; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6882; AVX2-NEXT: 
vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 6883; AVX2-NEXT: vbroadcastsd 136(%rdx), %ymm1 6884; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6885; AVX2-NEXT: vmovaps 128(%rcx), %xmm1 6886; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6887; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6888; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 6889; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6890; AVX2-NEXT: vmovaps 128(%r9), %xmm1 6891; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6892; AVX2-NEXT: vmovaps 128(%r8), %xmm0 6893; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6894; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 6895; AVX2-NEXT: vbroadcastsd 136(%r10), %ymm1 6896; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6897; AVX2-NEXT: vmovaps 128(%rax), %xmm1 6898; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill 6899; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6900; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 6901; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6902; AVX2-NEXT: vmovaps 160(%rsi), %xmm1 6903; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6904; AVX2-NEXT: vmovaps 160(%rdi), %xmm0 6905; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6906; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 6907; AVX2-NEXT: vbroadcastsd 168(%rdx), %ymm1 6908; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6909; AVX2-NEXT: vmovaps 160(%rcx), %xmm1 6910; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6911; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6912; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 6913; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6914; AVX2-NEXT: vmovaps 160(%r9), %xmm0 6915; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6916; AVX2-NEXT: vmovaps 160(%r8), %xmm13 6917; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] 6918; AVX2-NEXT: vbroadcastsd 168(%r10), %ymm1 6919; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6920; AVX2-NEXT: vmovaps 160(%rax), %xmm12 6921; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 6922; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 6923; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6924; AVX2-NEXT: vmovaps 192(%rsi), %xmm11 6925; AVX2-NEXT: vmovaps 192(%rdi), %xmm10 6926; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] 6927; AVX2-NEXT: vbroadcastsd 200(%rdx), %ymm1 6928; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6929; AVX2-NEXT: vmovaps 192(%rcx), %xmm9 6930; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 6931; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 6932; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6933; AVX2-NEXT: vmovaps 192(%r9), %xmm8 6934; AVX2-NEXT: vmovaps 192(%r8), %xmm7 6935; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] 6936; AVX2-NEXT: vbroadcastsd 200(%r10), %ymm1 6937; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6938; AVX2-NEXT: vmovaps 192(%rax), %xmm6 6939; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 6940; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 6941; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6942; AVX2-NEXT: vmovaps 224(%rsi), %xmm5 6943; AVX2-NEXT: vmovaps 224(%rdi), %xmm4 6944; 
AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] 6945; AVX2-NEXT: vbroadcastsd 232(%rdx), %ymm1 6946; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6947; AVX2-NEXT: vmovaps 224(%rcx), %xmm3 6948; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 6949; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 6950; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6951; AVX2-NEXT: vmovaps 224(%r9), %xmm2 6952; AVX2-NEXT: vmovaps 224(%r8), %xmm1 6953; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] 6954; AVX2-NEXT: vbroadcastsd 232(%r10), %ymm15 6955; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] 6956; AVX2-NEXT: vmovaps 224(%rax), %xmm0 6957; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 6958; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] 6959; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6960; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6961; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 6962; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] 6963; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm14 6964; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 6965; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 6966; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6967; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6968; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 6969; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] 6970; AVX2-NEXT: vinsertf128 $1, (%r10), %ymm14, %ymm14 6971; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 6972; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 6973; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6974; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6975; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 6976; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] 6977; AVX2-NEXT: vinsertf128 $1, 32(%rdx), %ymm14, %ymm14 6978; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 6979; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 6980; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6981; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6982; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 6983; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] 6984; AVX2-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14 6985; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 6986; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 6987; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6988; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6989; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 6990; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] 6991; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm14, %ymm14 6992; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 6993; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 6994; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6995; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6996; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, 
%xmm14 # 16-byte Folded Reload 6997; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] 6998; AVX2-NEXT: vinsertf128 $1, 64(%r10), %ymm14, %ymm14 6999; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 7000; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 7001; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7002; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7003; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 7004; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] 7005; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14 7006; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 7007; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 7008; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7009; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7010; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 7011; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] 7012; AVX2-NEXT: vinsertf128 $1, 96(%r10), %ymm14, %ymm14 7013; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 7014; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 7015; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7016; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7017; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 7018; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] 7019; AVX2-NEXT: vinsertf128 $1, 128(%rdx), %ymm14, %ymm14 7020; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 7021; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 7022; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7023; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7024; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 7025; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] 7026; AVX2-NEXT: vinsertf128 $1, 128(%r10), %ymm14, %ymm14 7027; AVX2-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload 7028; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 7029; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7030; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7031; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 7032; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] 7033; AVX2-NEXT: vinsertf128 $1, 160(%rdx), %ymm14, %ymm14 7034; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 7035; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 7036; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7037; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload 7038; AVX2-NEXT: # xmm13 = xmm13[0],mem[0] 7039; AVX2-NEXT: vinsertf128 $1, 160(%r10), %ymm13, %ymm13 7040; AVX2-NEXT: vbroadcastsd %xmm12, %ymm12 7041; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] 7042; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7043; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] 7044; AVX2-NEXT: vinsertf128 $1, 192(%rdx), %ymm10, %ymm10 7045; AVX2-NEXT: vbroadcastsd %xmm9, %ymm9 7046; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] 7047; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
7048; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] 7049; AVX2-NEXT: vinsertf128 $1, 192(%r10), %ymm7, %ymm7 7050; AVX2-NEXT: vbroadcastsd %xmm6, %ymm6 7051; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] 7052; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7053; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] 7054; AVX2-NEXT: vinsertf128 $1, 224(%rdx), %ymm4, %ymm4 7055; AVX2-NEXT: vbroadcastsd %xmm3, %ymm3 7056; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] 7057; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7058; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] 7059; AVX2-NEXT: vinsertf128 $1, 224(%r10), %ymm1, %ymm1 7060; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 7061; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 7062; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7063; AVX2-NEXT: vmovaps (%rdi), %ymm0 7064; AVX2-NEXT: vmovaps (%rsi), %ymm1 7065; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 7066; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 7067; AVX2-NEXT: vbroadcastsd 16(%rcx), %ymm3 7068; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 7069; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7070; AVX2-NEXT: vmovaps (%r8), %ymm2 7071; AVX2-NEXT: vmovaps (%r9), %ymm3 7072; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 7073; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 7074; AVX2-NEXT: vbroadcastsd 16(%rax), %ymm5 7075; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 7076; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7077; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 7078; AVX2-NEXT: vbroadcastsd 24(%rdx), %ymm1 7079; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3] 7080; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 7081; AVX2-NEXT: vbroadcastsd 24(%r10), %ymm1 7082; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] 7083; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 7084; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 7085; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 7086; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 7087; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm3 7088; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 7089; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7090; AVX2-NEXT: vmovaps 32(%r8), %ymm2 7091; AVX2-NEXT: vmovaps 32(%r9), %ymm3 7092; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 7093; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 7094; AVX2-NEXT: vbroadcastsd 48(%rax), %ymm5 7095; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 7096; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7097; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 7098; AVX2-NEXT: vbroadcastsd 56(%rdx), %ymm1 7099; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] 7100; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 7101; AVX2-NEXT: vbroadcastsd 56(%r10), %ymm1 7102; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 7103; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7104; AVX2-NEXT: vmovaps 64(%rdi), %ymm0 7105; AVX2-NEXT: vmovaps 64(%rsi), %ymm1 7106; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 7107; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = 
ymm2[2,3],mem[2,3] 7108; AVX2-NEXT: vbroadcastsd 80(%rcx), %ymm3 7109; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 7110; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7111; AVX2-NEXT: vmovaps 64(%r8), %ymm2 7112; AVX2-NEXT: vmovaps 64(%r9), %ymm3 7113; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 7114; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 7115; AVX2-NEXT: vbroadcastsd 80(%rax), %ymm5 7116; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 7117; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7118; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 7119; AVX2-NEXT: vbroadcastsd 88(%rdx), %ymm1 7120; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 7121; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7122; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 7123; AVX2-NEXT: vbroadcastsd 88(%r10), %ymm1 7124; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 7125; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7126; AVX2-NEXT: vmovaps 96(%rdi), %ymm0 7127; AVX2-NEXT: vmovaps 96(%rsi), %ymm1 7128; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 7129; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 7130; AVX2-NEXT: vbroadcastsd 112(%rcx), %ymm3 7131; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 7132; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7133; AVX2-NEXT: vmovaps 96(%r8), %ymm2 7134; AVX2-NEXT: vmovaps 96(%r9), %ymm3 7135; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 7136; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 7137; AVX2-NEXT: vbroadcastsd 112(%rax), %ymm5 7138; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 7139; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7140; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 7141; AVX2-NEXT: vbroadcastsd 120(%rdx), %ymm1 7142; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 7143; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 7144; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 7145; AVX2-NEXT: vbroadcastsd 120(%r10), %ymm1 7146; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 7147; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7148; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 7149; AVX2-NEXT: vmovaps 128(%rsi), %ymm1 7150; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 7151; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 7152; AVX2-NEXT: vbroadcastsd 144(%rcx), %ymm3 7153; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 7154; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7155; AVX2-NEXT: vmovaps 128(%r8), %ymm2 7156; AVX2-NEXT: vmovaps 128(%r9), %ymm3 7157; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 7158; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 7159; AVX2-NEXT: vbroadcastsd 144(%rax), %ymm5 7160; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 7161; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7162; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 7163; AVX2-NEXT: vbroadcastsd 152(%rdx), %ymm1 7164; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 7165; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7166; AVX2-NEXT: 
vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 152(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 160(%rdi), %ymm0
; AVX2-NEXT: vmovaps 160(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 176(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 160(%r8), %ymm2
; AVX2-NEXT: vmovaps 160(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 176(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 184(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 184(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 192(%rdi), %ymm0
; AVX2-NEXT: vmovaps 192(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 208(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 192(%r8), %ymm2
; AVX2-NEXT: vmovaps 192(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 208(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 216(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 216(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovaps 224(%rdi), %ymm0
; AVX2-NEXT: vmovaps 224(%rsi), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 240(%rcx), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovaps 224(%r8), %ymm5
; AVX2-NEXT: vmovaps 224(%r9), %ymm6
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 240(%rax), %ymm15
; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 248(%rdx), %ymm3
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm3[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
; AVX2-NEXT: vbroadcastsd 248(%r10), %ymm3
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm3[2,3]
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovaps %ymm2, 2016(%rdx)
; AVX2-NEXT: vmovaps %ymm4, 1984(%rdx)
; AVX2-NEXT: vmovaps %ymm15, 1952(%rdx)
; AVX2-NEXT: vmovaps %ymm10, 1920(%rdx)
; AVX2-NEXT: vmovaps %ymm11, 1760(%rdx)
; AVX2-NEXT: vmovaps %ymm13, 1728(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, 1696(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, 1664(%rdx)
; AVX2-NEXT: vmovaps %ymm0, 1504(%rdx)
; AVX2-NEXT: vmovaps %ymm1, 1472(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1440(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1408(%rdx)
; AVX2-NEXT: vmovaps %ymm3, 1248(%rdx)
; AVX2-NEXT: vmovaps %ymm5, 1216(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1184(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1152(%rdx)
; AVX2-NEXT: vmovaps %ymm6, 992(%rdx)
; AVX2-NEXT: vmovaps %ymm7, 960(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 928(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 896(%rdx)
; AVX2-NEXT: vmovaps %ymm8, 736(%rdx)
; AVX2-NEXT: vmovaps %ymm9, 704(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 672(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 640(%rdx)
; AVX2-NEXT: vmovaps %ymm12, 480(%rdx)
; AVX2-NEXT: vmovaps %ymm14, 448(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 416(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 384(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 224(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 192(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 160(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 128(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1888(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1856(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1824(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1792(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1632(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1600(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1568(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1536(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1376(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1344(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1312(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1280(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1120(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1088(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1056(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1024(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 864(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 832(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 800(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 768(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 608(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 576(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 544(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 512(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 352(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 320(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 288(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 256(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 96(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 64(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-NEXT: addq $1704, %rsp # imm = 0x6A8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i64_stride8_vf32:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: subq $1704, %rsp # imm = 0x6A8
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovaps (%rcx), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm3
; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovaps (%rsi), %xmm2
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm4
; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps (%rdi), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm5
; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX2-FP-NEXT: vbroadcastsd 8(%rdx), %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps (%rax), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovaps (%r9), %xmm2
; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm6
; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps (%r8), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm7
; AVX2-FP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX2-FP-NEXT: vbroadcastsd 8(%r10), %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1]
; AVX2-FP-NEXT: vbroadcastsd 40(%rdx), %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1]
; AVX2-FP-NEXT: vbroadcastsd 40(%r10), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FP-NEXT: vbroadcastsd 72(%rdx), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 64(%rcx), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 64(%r9), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 64(%r8), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FP-NEXT: vbroadcastsd 72(%r10), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 64(%rax), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 96(%rsi), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FP-NEXT: vbroadcastsd 104(%rdx), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 96(%rcx), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 96(%r9), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 96(%r8), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FP-NEXT: vbroadcastsd 104(%r10), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 96(%rax), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 128(%rsi), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FP-NEXT: vbroadcastsd 136(%rdx), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 128(%rcx), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 128(%r9), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 128(%r8), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FP-NEXT: vbroadcastsd 136(%r10), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 128(%rax), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 160(%rsi), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FP-NEXT: vbroadcastsd 168(%rdx), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 160(%rcx), %xmm1
; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 160(%r9), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovaps 160(%r8), %xmm13
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1]
; AVX2-FP-NEXT: vbroadcastsd 168(%r10), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 160(%rax), %xmm12
; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 192(%rsi), %xmm11
; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm10
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1]
; AVX2-FP-NEXT: vbroadcastsd 200(%rdx), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 192(%rcx), %xmm9
; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 192(%r9), %xmm8
; AVX2-FP-NEXT: vmovaps 192(%r8), %xmm7
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1]
; AVX2-FP-NEXT: vbroadcastsd 200(%r10), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 192(%rax), %xmm6
; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 224(%rsi), %xmm5
; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm4
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1]
; AVX2-FP-NEXT: vbroadcastsd 232(%rdx), %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 224(%rcx), %xmm3
; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 224(%r9), %xmm2
; AVX2-FP-NEXT: vmovaps 224(%r8), %xmm1
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1]
; AVX2-FP-NEXT: vbroadcastsd 232(%r10), %ymm15
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 224(%rax), %xmm0
; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm14
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, (%r10), %ymm14, %ymm14
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, 32(%rdx), %ymm14, %ymm14
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm14, %ymm14
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, 64(%r10), %ymm14, %ymm14
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, 96(%r10), %ymm14, %ymm14
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdx), %ymm14, %ymm14
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, 128(%r10), %ymm14, %ymm14
; AVX2-FP-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdx), %ymm14, %ymm14
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm13 = xmm13[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, 160(%r10), %ymm13, %ymm13
; AVX2-FP-NEXT: vbroadcastsd %xmm12, %ymm12
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0]
; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdx), %ymm10, %ymm10
; AVX2-FP-NEXT: vbroadcastsd %xmm9, %ymm9
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0]
; AVX2-FP-NEXT: vinsertf128 $1, 192(%r10), %ymm7, %ymm7
; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm6
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdx), %ymm4, %ymm4
; AVX2-FP-NEXT: vbroadcastsd %xmm3, %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FP-NEXT: vinsertf128 $1, 224(%r10), %ymm1, %ymm1
; AVX2-FP-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 16(%rcx), %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps (%r8), %ymm2
; AVX2-FP-NEXT: vmovaps (%r9), %ymm3
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 16(%rax), %ymm5
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FP-NEXT: vbroadcastsd 24(%rdx), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vbroadcastsd 24(%r10), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm1
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm2
; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm3
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 48(%rax), %ymm5
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FP-NEXT: vbroadcastsd 56(%rdx), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vbroadcastsd 56(%r10), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm1
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 80(%rcx), %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 64(%r8), %ymm2
; AVX2-FP-NEXT: vmovaps 64(%r9), %ymm3
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 80(%rax), %ymm5
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FP-NEXT: vbroadcastsd 88(%rdx), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vbroadcastsd 88(%r10), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm1
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 112(%rcx), %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 96(%r8), %ymm2
; AVX2-FP-NEXT: vmovaps 96(%r9), %ymm3
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 112(%rax), %ymm5
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FP-NEXT: vbroadcastsd 120(%rdx), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vbroadcastsd 120(%r10), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm1
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 144(%rcx), %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 128(%r8), %ymm2
; AVX2-FP-NEXT: vmovaps 128(%r9), %ymm3
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 144(%rax), %ymm5
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FP-NEXT: vbroadcastsd 152(%rdx), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vbroadcastsd 152(%r10), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm1
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 176(%rcx), %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 160(%r8), %ymm2
; AVX2-FP-NEXT: vmovaps 160(%r9), %ymm3
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 176(%rax), %ymm5
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FP-NEXT: vbroadcastsd 184(%rdx), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vbroadcastsd 184(%r10), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm1
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 208(%rcx), %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 192(%r8), %ymm2
; AVX2-FP-NEXT: vmovaps 192(%r9), %ymm3
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 208(%rax), %ymm5
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FP-NEXT: vbroadcastsd 216(%rdx), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vbroadcastsd 216(%r10), %ymm1
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm3
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 240(%rcx), %ymm5
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FP-NEXT: vmovaps 224(%r8), %ymm5
; AVX2-FP-NEXT: vmovaps 224(%r9), %ymm6
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],mem[2,3]
; AVX2-FP-NEXT: vbroadcastsd 240(%rax), %ymm15
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
; AVX2-FP-NEXT: vbroadcastsd 248(%rdx), %ymm3
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm3[2,3]
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
; AVX2-FP-NEXT: vbroadcastsd 248(%r10), %ymm3
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm3[2,3]
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovaps %ymm2, 2016(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm4, 1984(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm15, 1952(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm10, 1920(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm11, 1760(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm13, 1728(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm2, 1696(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm2, 1664(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm0, 1504(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm1, 1472(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1440(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1408(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm3, 1248(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm5, 1216(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1184(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1152(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm6, 992(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm7, 960(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 928(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 896(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm8, 736(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm9, 704(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 672(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 640(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm12, 480(%rdx)
; AVX2-FP-NEXT: vmovaps %ymm14, 448(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1888(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1856(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1824(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1792(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1632(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1600(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1568(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1536(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1376(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1344(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1312(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1280(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1120(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1088(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1056(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1024(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 864(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 832(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 800(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 768(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 608(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 576(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 544(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 512(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-FP-NEXT: addq $1704, %rsp # imm = 0x6A8
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i64_stride8_vf32:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $1704, %rsp # imm = 0x6A8
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm3
; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm4
; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm5
; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX2-FCP-NEXT: vbroadcastsd 8(%rdx), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rax), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vmovaps (%r9), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%r9), %xmm6
; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps (%r8), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%r8), %xmm7
; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX2-FCP-NEXT: vbroadcastsd 8(%r10), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1]
; AVX2-FCP-NEXT: vbroadcastsd 40(%rdx), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1]
; AVX2-FCP-NEXT: vbroadcastsd 40(%r10), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FCP-NEXT: vbroadcastsd 72(%rdx), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 64(%rcx), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%r9), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%r8), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FCP-NEXT: vbroadcastsd 72(%r10), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 64(%rax), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FCP-NEXT: vbroadcastsd 104(%rdx), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 96(%rcx), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 96(%r9), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 96(%r8), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FCP-NEXT: vbroadcastsd 104(%r10), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 96(%rax), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 128(%rsi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FCP-NEXT: vbroadcastsd 136(%rdx), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 128(%rcx), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 128(%r9), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 128(%r8), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FCP-NEXT: vbroadcastsd 136(%r10), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 128(%rax), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 160(%rsi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FCP-NEXT: vbroadcastsd 168(%rdx), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 160(%rcx), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 160(%r9), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 160(%r8), %xmm13
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1]
; AVX2-FCP-NEXT: vbroadcastsd 168(%r10), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 160(%rax), %xmm12
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 192(%rsi), %xmm11
; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm10
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1]
; AVX2-FCP-NEXT: vbroadcastsd 200(%rdx), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 192(%rcx), %xmm9
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 192(%r9), %xmm8
; AVX2-FCP-NEXT: vmovaps 192(%r8), %xmm7
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1]
; AVX2-FCP-NEXT: vbroadcastsd 200(%r10), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 192(%rax), %xmm6
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 224(%rsi), %xmm5
; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm4
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1]
; AVX2-FCP-NEXT: vbroadcastsd 232(%rdx), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 224(%rcx), %xmm3
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 224(%r9), %xmm2
; AVX2-FCP-NEXT: vmovaps 224(%r8), %xmm1
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1]
; AVX2-FCP-NEXT: vbroadcastsd 232(%r10), %ymm15
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 224(%rax), %xmm0
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, (%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 64(%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 96(%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 128(%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm13 = xmm13[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 160(%r10), %ymm13, %ymm13
; AVX2-FCP-NEXT: vbroadcastsd %xmm12, %ymm12
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdx), %ymm10, %ymm10
; AVX2-FCP-NEXT: vbroadcastsd %xmm9, %ymm9
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 192(%r10), %ymm7, %ymm7
; AVX2-FCP-NEXT: vbroadcastsd %xmm6, %ymm6
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdx), %ymm4, %ymm4
; AVX2-FCP-NEXT: vbroadcastsd %xmm3, %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 224(%r10), %ymm1, %ymm1
; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 16(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps (%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 16(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 24(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 24(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 48(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 56(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 56(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 80(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 64(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 80(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 88(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 88(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 112(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 96(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 96(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 112(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 120(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 120(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 144(%rcx), %ymm3
;
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 8348; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8349; AVX2-FCP-NEXT: vmovaps 128(%r8), %ymm2 8350; AVX2-FCP-NEXT: vmovaps 128(%r9), %ymm3 8351; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 8352; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 8353; AVX2-FCP-NEXT: vbroadcastsd 144(%rax), %ymm5 8354; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 8355; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8356; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 8357; AVX2-FCP-NEXT: vbroadcastsd 152(%rdx), %ymm1 8358; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 8359; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8360; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 8361; AVX2-FCP-NEXT: vbroadcastsd 152(%r10), %ymm1 8362; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 8363; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8364; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0 8365; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm1 8366; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 8367; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 8368; AVX2-FCP-NEXT: vbroadcastsd 176(%rcx), %ymm3 8369; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 8370; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8371; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm2 8372; AVX2-FCP-NEXT: vmovaps 160(%r9), %ymm3 8373; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 8374; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 8375; AVX2-FCP-NEXT: vbroadcastsd 176(%rax), %ymm5 8376; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 8377; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8378; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 8379; AVX2-FCP-NEXT: vbroadcastsd 184(%rdx), %ymm1 8380; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 8381; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8382; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 8383; AVX2-FCP-NEXT: vbroadcastsd 184(%r10), %ymm1 8384; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 8385; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8386; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm0 8387; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm1 8388; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 8389; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 8390; AVX2-FCP-NEXT: vbroadcastsd 208(%rcx), %ymm3 8391; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 8392; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8393; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm2 8394; AVX2-FCP-NEXT: vmovaps 192(%r9), %ymm3 8395; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 8396; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 8397; AVX2-FCP-NEXT: vbroadcastsd 208(%rax), %ymm5 8398; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 8399; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8400; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 8401; AVX2-FCP-NEXT: vbroadcastsd 
216(%rdx), %ymm1 8402; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3] 8403; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 8404; AVX2-FCP-NEXT: vbroadcastsd 216(%r10), %ymm1 8405; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] 8406; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm0 8407; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm3 8408; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] 8409; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] 8410; AVX2-FCP-NEXT: vbroadcastsd 240(%rcx), %ymm5 8411; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm5[6,7] 8412; AVX2-FCP-NEXT: vmovaps 224(%r8), %ymm5 8413; AVX2-FCP-NEXT: vmovaps 224(%r9), %ymm6 8414; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] 8415; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],mem[2,3] 8416; AVX2-FCP-NEXT: vbroadcastsd 240(%rax), %ymm15 8417; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3,4,5],ymm15[6,7] 8418; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] 8419; AVX2-FCP-NEXT: vbroadcastsd 248(%rdx), %ymm3 8420; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm3[2,3] 8421; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] 8422; AVX2-FCP-NEXT: vbroadcastsd 248(%r10), %ymm3 8423; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm3[2,3] 8424; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdx 8425; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],mem[6,7] 8426; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8427; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],mem[6,7] 8428; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8429; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5],mem[6,7] 8430; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8431; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7] 8432; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8433; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],mem[6,7] 8434; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8435; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],mem[6,7] 8436; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 8437; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] 8438; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8439; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] 8440; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8441; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7] 8442; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8443; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] 8444; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8445; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] 8446; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8447; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 8448; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] 8449; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] 8450; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] 8451; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] 8452; AVX2-FCP-NEXT: vmovaps %ymm2, 2016(%rdx) 8453; AVX2-FCP-NEXT: vmovaps %ymm4, 
1984(%rdx) 8454; AVX2-FCP-NEXT: vmovaps %ymm15, 1952(%rdx) 8455; AVX2-FCP-NEXT: vmovaps %ymm10, 1920(%rdx) 8456; AVX2-FCP-NEXT: vmovaps %ymm11, 1760(%rdx) 8457; AVX2-FCP-NEXT: vmovaps %ymm13, 1728(%rdx) 8458; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 8459; AVX2-FCP-NEXT: vmovaps %ymm2, 1696(%rdx) 8460; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 8461; AVX2-FCP-NEXT: vmovaps %ymm2, 1664(%rdx) 8462; AVX2-FCP-NEXT: vmovaps %ymm0, 1504(%rdx) 8463; AVX2-FCP-NEXT: vmovaps %ymm1, 1472(%rdx) 8464; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8465; AVX2-FCP-NEXT: vmovaps %ymm0, 1440(%rdx) 8466; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8467; AVX2-FCP-NEXT: vmovaps %ymm0, 1408(%rdx) 8468; AVX2-FCP-NEXT: vmovaps %ymm3, 1248(%rdx) 8469; AVX2-FCP-NEXT: vmovaps %ymm5, 1216(%rdx) 8470; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8471; AVX2-FCP-NEXT: vmovaps %ymm0, 1184(%rdx) 8472; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8473; AVX2-FCP-NEXT: vmovaps %ymm0, 1152(%rdx) 8474; AVX2-FCP-NEXT: vmovaps %ymm6, 992(%rdx) 8475; AVX2-FCP-NEXT: vmovaps %ymm7, 960(%rdx) 8476; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8477; AVX2-FCP-NEXT: vmovaps %ymm0, 928(%rdx) 8478; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8479; AVX2-FCP-NEXT: vmovaps %ymm0, 896(%rdx) 8480; AVX2-FCP-NEXT: vmovaps %ymm8, 736(%rdx) 8481; AVX2-FCP-NEXT: vmovaps %ymm9, 704(%rdx) 8482; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8483; AVX2-FCP-NEXT: vmovaps %ymm0, 672(%rdx) 8484; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8485; AVX2-FCP-NEXT: vmovaps %ymm0, 640(%rdx) 8486; AVX2-FCP-NEXT: vmovaps %ymm12, 480(%rdx) 8487; AVX2-FCP-NEXT: vmovaps %ymm14, 448(%rdx) 8488; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8489; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rdx) 8490; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8491; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rdx) 8492; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8493; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rdx) 8494; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8495; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rdx) 8496; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8497; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rdx) 8498; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8499; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rdx) 8500; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8501; AVX2-FCP-NEXT: vmovaps %ymm0, 1888(%rdx) 8502; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8503; AVX2-FCP-NEXT: vmovaps %ymm0, 1856(%rdx) 8504; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8505; AVX2-FCP-NEXT: vmovaps %ymm0, 1824(%rdx) 8506; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8507; AVX2-FCP-NEXT: vmovaps %ymm0, 1792(%rdx) 8508; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8509; AVX2-FCP-NEXT: vmovaps %ymm0, 1632(%rdx) 8510; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8511; AVX2-FCP-NEXT: vmovaps %ymm0, 1600(%rdx) 8512; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8513; AVX2-FCP-NEXT: vmovaps %ymm0, 1568(%rdx) 8514; AVX2-FCP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8515; AVX2-FCP-NEXT: vmovaps %ymm0, 1536(%rdx) 8516; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8517; AVX2-FCP-NEXT: vmovaps %ymm0, 1376(%rdx) 8518; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8519; AVX2-FCP-NEXT: vmovaps %ymm0, 1344(%rdx) 8520; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8521; AVX2-FCP-NEXT: vmovaps %ymm0, 1312(%rdx) 8522; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8523; AVX2-FCP-NEXT: vmovaps %ymm0, 1280(%rdx) 8524; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8525; AVX2-FCP-NEXT: vmovaps %ymm0, 1120(%rdx) 8526; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8527; AVX2-FCP-NEXT: vmovaps %ymm0, 1088(%rdx) 8528; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8529; AVX2-FCP-NEXT: vmovaps %ymm0, 1056(%rdx) 8530; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8531; AVX2-FCP-NEXT: vmovaps %ymm0, 1024(%rdx) 8532; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8533; AVX2-FCP-NEXT: vmovaps %ymm0, 864(%rdx) 8534; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8535; AVX2-FCP-NEXT: vmovaps %ymm0, 832(%rdx) 8536; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8537; AVX2-FCP-NEXT: vmovaps %ymm0, 800(%rdx) 8538; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8539; AVX2-FCP-NEXT: vmovaps %ymm0, 768(%rdx) 8540; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8541; AVX2-FCP-NEXT: vmovaps %ymm0, 608(%rdx) 8542; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8543; AVX2-FCP-NEXT: vmovaps %ymm0, 576(%rdx) 8544; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8545; AVX2-FCP-NEXT: vmovaps %ymm0, 544(%rdx) 8546; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8547; AVX2-FCP-NEXT: vmovaps %ymm0, 512(%rdx) 8548; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8549; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rdx) 8550; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8551; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rdx) 8552; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8553; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rdx) 8554; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8555; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rdx) 8556; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8557; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rdx) 8558; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8559; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rdx) 8560; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8561; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rdx) 8562; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8563; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx) 8564; AVX2-FCP-NEXT: addq $1704, %rsp # imm = 0x6A8 8565; AVX2-FCP-NEXT: vzeroupper 8566; AVX2-FCP-NEXT: retq 8567; 8568; AVX512-LABEL: store_i64_stride8_vf32: 8569; AVX512: # %bb.0: 8570; AVX512-NEXT: subq $2312, %rsp # imm = 0x908 8571; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 8572; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 8573; AVX512-NEXT: vmovaps 128(%rdi), %zmm0 8574; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8575; AVX512-NEXT: 
vmovdqa64 64(%rdi), %zmm6 8576; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 8577; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm0 8578; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 8579; AVX512-NEXT: vmovdqa64 (%rsi), %zmm3 8580; AVX512-NEXT: vmovaps 192(%rdx), %zmm1 8581; AVX512-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8582; AVX512-NEXT: vmovaps 128(%rdx), %zmm1 8583; AVX512-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8584; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm7 8585; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4 8586; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm23 8587; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm1 8588; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm9 8589; AVX512-NEXT: vmovdqa64 (%rcx), %zmm8 8590; AVX512-NEXT: vmovdqa64 (%r8), %zmm18 8591; AVX512-NEXT: vmovdqa64 64(%r8), %zmm16 8592; AVX512-NEXT: vmovdqa64 (%r9), %zmm12 8593; AVX512-NEXT: vmovdqa64 64(%r9), %zmm19 8594; AVX512-NEXT: vmovdqa64 (%r10), %zmm25 8595; AVX512-NEXT: vmovdqa64 64(%r10), %zmm17 8596; AVX512-NEXT: vmovdqa64 (%rax), %zmm15 8597; AVX512-NEXT: vmovdqa64 64(%rax), %zmm20 8598; AVX512-NEXT: movb $-64, %r11b 8599; AVX512-NEXT: kmovw %r11d, %k1 8600; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] 8601; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8602; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 8603; AVX512-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 8604; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 8605; AVX512-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 8606; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 8607; AVX512-NEXT: vmovdqa64 %zmm25, %zmm11 8608; AVX512-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 8609; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] 8610; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} 8611; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 8612; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8613; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] 8614; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8615; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 8616; AVX512-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 8617; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 8618; AVX512-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 8619; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 8620; AVX512-NEXT: vmovdqa64 %zmm25, %zmm11 8621; AVX512-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 8622; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] 8623; AVX512-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} 8624; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 8625; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8626; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] 8627; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8628; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 8629; AVX512-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 8630; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 8631; AVX512-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 8632; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 8633; AVX512-NEXT: vmovdqa64 %zmm18, %zmm11 8634; AVX512-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 8635; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] 8636; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 8637; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8638; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = 
[7,15,7,15,7,15,7,15] 8639; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8640; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 8641; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 8642; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 8643; AVX512-NEXT: vmovdqa64 %zmm18, %zmm3 8644; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 8645; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] 8646; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 8647; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8648; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 8649; AVX512-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 8650; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 8651; AVX512-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 8652; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 8653; AVX512-NEXT: vmovdqa64 %zmm17, %zmm3 8654; AVX512-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 8655; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] 8656; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 8657; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 8658; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8659; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 8660; AVX512-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 8661; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 8662; AVX512-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 8663; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 8664; AVX512-NEXT: vmovdqa64 %zmm17, %zmm3 8665; AVX512-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 8666; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] 8667; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 8668; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 8669; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8670; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 8671; AVX512-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 8672; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 8673; AVX512-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 8674; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 8675; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 8676; AVX512-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 8677; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] 8678; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 8679; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8680; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 8681; AVX512-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 8682; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] 8683; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 8684; AVX512-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 8685; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] 8686; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 8687; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8688; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 8689; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 8690; AVX512-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 8691; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8692; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 8693; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 8694; AVX512-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 8695; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8696; AVX512-NEXT: vmovdqa64 
%zmm3, %zmm4 8697; AVX512-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 8698; AVX512-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill 8699; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 8700; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 8701; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8702; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 8703; AVX512-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 8704; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8705; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 8706; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 8707; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8708; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 8709; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8710; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 8711; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8712; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8713; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 8714; AVX512-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 8715; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8716; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 8717; AVX512-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 8718; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8719; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 8720; AVX512-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 8721; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8722; AVX512-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 8723; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8724; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm1 8725; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm0 8726; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 8727; AVX512-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 8728; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8729; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 8730; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 8731; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8732; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 8733; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 8734; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8735; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 8736; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8737; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] 8738; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8739; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 8740; AVX512-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 8741; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8742; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] 8743; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8744; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 8745; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 8746; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8747; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] 8748; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8749; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 8750; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 8751; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8752; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] 8753; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8754; AVX512-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 8755; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8756; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 8757; 
AVX512-NEXT: vmovdqa64 %zmm18, %zmm29 8758; AVX512-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 8759; AVX512-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 8760; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 8761; AVX512-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 8762; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8763; AVX512-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 8764; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8765; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 8766; AVX512-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 8767; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8768; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 8769; AVX512-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 8770; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8771; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 8772; AVX512-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 8773; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8774; AVX512-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 8775; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8776; AVX512-NEXT: vmovdqa64 %zmm16, %zmm14 8777; AVX512-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 8778; AVX512-NEXT: vmovdqa64 %zmm16, %zmm11 8779; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 8780; AVX512-NEXT: vmovdqa64 %zmm16, %zmm25 8781; AVX512-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 8782; AVX512-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 8783; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8784; AVX512-NEXT: vmovdqa64 128(%r10), %zmm19 8785; AVX512-NEXT: vmovdqa64 128(%rax), %zmm0 8786; AVX512-NEXT: vmovdqa64 %zmm19, %zmm28 8787; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 8788; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 8789; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 8790; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8791; AVX512-NEXT: vmovdqa64 %zmm19, %zmm13 8792; AVX512-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 8793; AVX512-NEXT: vmovdqa64 %zmm19, %zmm16 8794; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 8795; AVX512-NEXT: vmovdqa64 128(%r8), %zmm20 8796; AVX512-NEXT: vmovdqa64 128(%r9), %zmm1 8797; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 8798; AVX512-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 8799; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] 8800; AVX512-NEXT: vmovdqa64 %zmm20, %zmm10 8801; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 8802; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] 8803; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 8804; AVX512-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 8805; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8806; AVX512-NEXT: vpermt2q %zmm0, %zmm26, %zmm19 8807; AVX512-NEXT: vmovdqa64 %zmm20, %zmm4 8808; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 8809; AVX512-NEXT: vmovdqa64 %zmm20, %zmm6 8810; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 8811; AVX512-NEXT: vmovdqa64 %zmm20, %zmm17 8812; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm17 8813; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] 8814; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7] 8815; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 8816; AVX512-NEXT: vmovdqa64 192(%r10), %zmm8 8817; AVX512-NEXT: vmovdqa64 192(%rax), %zmm1 8818; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 8819; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 8820; AVX512-NEXT: vmovdqa64 %zmm8, 
%zmm15 8821; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 8822; AVX512-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 8823; AVX512-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 8824; AVX512-NEXT: vmovdqa64 192(%r8), %zmm2 8825; AVX512-NEXT: vmovdqa64 192(%r9), %zmm0 8826; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 8827; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] 8828; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 8829; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] 8830; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 8831; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 8832; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8833; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 8834; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 8835; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 8836; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm23 8837; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] 8838; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] 8839; AVX512-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 8840; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8841; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8842; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 8843; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} 8844; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 8845; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8846; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8847; AVX512-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 8848; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 8849; AVX512-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} 8850; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 8851; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8852; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8853; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8854; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 8855; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 8856; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8857; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8858; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} 8859; AVX512-NEXT: vmovdqa (%rsi), %xmm0 8860; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 8861; AVX512-NEXT: vmovdqa (%rdi), %xmm0 8862; AVX512-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 8863; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] 8864; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 8865; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8866; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8867; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} 8868; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] 8869; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 8870; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 8871; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8872; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} 8873; AVX512-NEXT: vmovdqa 64(%rsi), %xmm3 8874; AVX512-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 8875; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 8876; AVX512-NEXT: vinserti128 $1, 64(%rdx), 
%ymm5, %ymm5 8877; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] 8878; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 8879; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8880; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} 8881; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] 8882; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 8883; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} 8884; AVX512-NEXT: vmovdqa 128(%rsi), %xmm3 8885; AVX512-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 8886; AVX512-NEXT: vmovdqa 128(%rdi), %xmm5 8887; AVX512-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 8888; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] 8889; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 8890; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8891; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} 8892; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] 8893; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 8894; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8895; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 8896; AVX512-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] 8897; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 8898; AVX512-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} 8899; AVX512-NEXT: vmovdqa 192(%rsi), %xmm4 8900; AVX512-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 8901; AVX512-NEXT: vmovdqa 192(%rdi), %xmm4 8902; AVX512-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 8903; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] 8904; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 8905; AVX512-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} 8906; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] 8907; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 8908; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8909; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 8910; AVX512-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] 8911; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} 8912; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 8913; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8914; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 8915; AVX512-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] 8916; AVX512-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} 8917; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 8918; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8919; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 8920; AVX512-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] 8921; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 8922; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8923; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 8924; AVX512-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] 8925; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 8926; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8927; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 8928; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} 8929; AVX512-NEXT: vmovdqa (%rcx), %ymm1 8930; AVX512-NEXT: vmovdqa (%rdx), %ymm15 8931; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] 8932; 
AVX512-NEXT: vmovdqa64 (%rsi), %ymm21 8933; AVX512-NEXT: vmovdqa64 (%rdi), %ymm22 8934; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] 8935; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] 8936; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 8937; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 8938; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 8939; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 8940; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] 8941; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] 8942; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] 8943; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 8944; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 8945; AVX512-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} 8946; AVX512-NEXT: vmovdqa 64(%rcx), %ymm14 8947; AVX512-NEXT: vmovdqa 64(%rdx), %ymm15 8948; AVX512-NEXT: vmovdqa64 64(%rsi), %ymm21 8949; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm22 8950; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 8951; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] 8952; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] 8953; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 8954; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 8955; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 8956; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 8957; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] 8958; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] 8959; AVX512-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] 8960; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 8961; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 8962; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} 8963; AVX512-NEXT: vmovdqa 128(%rcx), %ymm14 8964; AVX512-NEXT: vmovdqa 128(%rdx), %ymm15 8965; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 8966; AVX512-NEXT: vmovdqa64 128(%rsi), %ymm21 8967; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm22 8968; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] 8969; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] 8970; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 8971; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} 8972; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] 8973; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] 8974; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] 8975; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 8976; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 8977; AVX512-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} 8978; AVX512-NEXT: vmovdqa 192(%rcx), %ymm14 8979; AVX512-NEXT: vmovdqa 192(%rdx), %ymm15 8980; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 8981; AVX512-NEXT: vmovdqa64 192(%rsi), %ymm17 8982; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm19 8983; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] 8984; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] 8985; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 8986; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} 8987; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm8 = 
ymm15[1],ymm14[1],ymm15[3],ymm14[3] 8988; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] 8989; AVX512-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] 8990; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 8991; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 8992; AVX512-NEXT: vmovdqa64 %zmm2, 1728(%rax) 8993; AVX512-NEXT: vmovdqa64 %zmm5, 1664(%rax) 8994; AVX512-NEXT: vmovdqa64 %zmm4, 1216(%rax) 8995; AVX512-NEXT: vmovdqa64 %zmm3, 1152(%rax) 8996; AVX512-NEXT: vmovdqa64 %zmm13, 704(%rax) 8997; AVX512-NEXT: vmovdqa64 %zmm11, 640(%rax) 8998; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) 8999; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) 9000; AVX512-NEXT: vmovdqa64 %zmm12, 1984(%rax) 9001; AVX512-NEXT: vmovdqa64 %zmm9, 1920(%rax) 9002; AVX512-NEXT: vmovdqa64 %zmm7, 1856(%rax) 9003; AVX512-NEXT: vmovdqa64 %zmm6, 1792(%rax) 9004; AVX512-NEXT: vmovdqa64 %zmm31, 1600(%rax) 9005; AVX512-NEXT: vmovdqa64 %zmm30, 1536(%rax) 9006; AVX512-NEXT: vmovdqa64 %zmm16, 1472(%rax) 9007; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9008; AVX512-NEXT: vmovaps %zmm0, 1408(%rax) 9009; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9010; AVX512-NEXT: vmovaps %zmm0, 1344(%rax) 9011; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9012; AVX512-NEXT: vmovaps %zmm0, 1280(%rax) 9013; AVX512-NEXT: vmovdqa64 %zmm18, 1088(%rax) 9014; AVX512-NEXT: vmovdqa64 %zmm28, 1024(%rax) 9015; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9016; AVX512-NEXT: vmovaps %zmm0, 960(%rax) 9017; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9018; AVX512-NEXT: vmovaps %zmm0, 896(%rax) 9019; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9020; AVX512-NEXT: vmovaps %zmm0, 832(%rax) 9021; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9022; AVX512-NEXT: vmovaps %zmm0, 768(%rax) 9023; AVX512-NEXT: vmovdqa64 %zmm26, 576(%rax) 9024; AVX512-NEXT: vmovdqa64 %zmm29, 512(%rax) 9025; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9026; AVX512-NEXT: vmovaps %zmm0, 448(%rax) 9027; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9028; AVX512-NEXT: vmovaps %zmm0, 384(%rax) 9029; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9030; AVX512-NEXT: vmovaps %zmm0, 320(%rax) 9031; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9032; AVX512-NEXT: vmovaps %zmm0, 256(%rax) 9033; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload 9034; AVX512-NEXT: vmovaps %zmm0, 64(%rax) 9035; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9036; AVX512-NEXT: vmovaps %zmm0, (%rax) 9037; AVX512-NEXT: addq $2312, %rsp # imm = 0x908 9038; AVX512-NEXT: vzeroupper 9039; AVX512-NEXT: retq 9040; 9041; AVX512-FCP-LABEL: store_i64_stride8_vf32: 9042; AVX512-FCP: # %bb.0: 9043; AVX512-FCP-NEXT: subq $2312, %rsp # imm = 0x908 9044; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 9045; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 9046; AVX512-FCP-NEXT: vmovaps 128(%rdi), %zmm0 9047; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9048; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 9049; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 9050; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 9051; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 9052; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 9053; AVX512-FCP-NEXT: vmovaps 192(%rdx), %zmm1 9054; AVX512-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill 9055; AVX512-FCP-NEXT: vmovaps 128(%rdx), %zmm1 9056; AVX512-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9057; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 9058; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 9059; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm23 9060; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm1 9061; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 9062; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm8 9063; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm18 9064; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 9065; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm12 9066; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm19 9067; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm25 9068; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm17 9069; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm15 9070; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm20 9071; AVX512-FCP-NEXT: movb $-64, %r11b 9072; AVX512-FCP-NEXT: kmovw %r11d, %k1 9073; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] 9074; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 9075; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 9076; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 9077; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 9078; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 9079; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 9080; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 9081; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 9082; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] 9083; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} 9084; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 9085; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9086; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] 9087; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 9088; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 9089; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 9090; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 9091; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 9092; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 9093; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 9094; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 9095; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] 9096; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} 9097; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 9098; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9099; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] 9100; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 9101; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 9102; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 9103; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 9104; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 9105; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 9106; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm11 9107; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 9108; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] 9109; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 9110; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9111; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] 9112; AVX512-FCP-NEXT: # zmm21 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm2
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm3
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm3
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm3
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6]
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm2
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm3
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm2
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm3
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm7
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm6
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm3
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm22, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8]
; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9]
; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10]
; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11]
; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm25
; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm29
; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm29
; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm31, %zmm18
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm17
; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14
; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm14
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm11
; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm11
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm25
; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25
; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm16
; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 128(%r10), %zmm19
; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm28
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm28
; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm13
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm13
; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm16
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm16
; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm20
; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm3
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6]
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm10
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm10
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm19
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm17
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm17
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7]
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm20
; AVX512-FCP-NEXT: vmovdqa64 192(%r10), %zmm8
; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm12
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm15
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm27
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm24
; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm2
; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm0
; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm22
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6]
; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm9
; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm8
; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm30
; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm31
; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm23
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7]
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm3
; AVX512-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26
; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1}
; AVX512-FCP-NEXT: vmovdqa 128(%rsi), %xmm3
; AVX512-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3
; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1}
; AVX512-FCP-NEXT: vmovdqa 192(%rsi), %xmm4
; AVX512-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5
; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm4
; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30
; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm1
; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm15
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2]
; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %ymm21
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm22
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1}
; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm14
; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm15
; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %ymm21
; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
; AVX512-FCP-NEXT: vmovdqa 128(%rcx), %ymm14
; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %ymm15
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %ymm21
; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3
; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1}
; AVX512-FCP-NEXT: vmovdqa 192(%rcx), %ymm14
; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %ymm15
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %ymm17
; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 1728(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 1984(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 1920(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 1856(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 1792(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 1600(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 1536(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 1472(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 1408(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 1344(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 1280(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 960(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 896(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 832(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 768(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 512(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax)
; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax)
; AVX512-FCP-NEXT: addq $2312, %rsp # imm = 0x908
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i64_stride8_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: subq $2312, %rsp # imm = 0x908
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: vmovaps 128(%rdi), %zmm0
; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm6
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm0
; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm5
; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm3
; AVX512DQ-NEXT: vmovaps 192(%rdx), %zmm1
; AVX512DQ-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovaps 128(%rdx), %zmm1
; AVX512DQ-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm23
; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm1
; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm9
; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm8
; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm18
; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm16
; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm12
; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm19
; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm25
; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm17
; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm15
; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm20
; AVX512DQ-NEXT: movb $-64, %r11b
; AVX512DQ-NEXT: kmovw %r11d, %k1
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12]
; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10
; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm27, %zmm10
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm27, %zmm11
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm11
; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm27, %zmm11
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6]
; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10
; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13]
; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10
; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm24, %zmm10
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm24, %zmm11
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm11
; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm24, %zmm11
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7]
; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10
; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14]
; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10
; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm22, %zmm10
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm22, %zmm11
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm11
; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm22, %zmm11
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10
; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15]
; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm21, %zmm4
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm21, %zmm2
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm3
; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm21, %zmm3
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm27, %zmm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm3
; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm27, %zmm3
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6]
; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm24, %zmm2
; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm24, %zmm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm3
; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm24, %zmm3
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7]
; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm22, %zmm2
; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm22, %zmm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3
; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm22, %zmm3
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm21, %zmm7
; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm6
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3
; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm21, %zmm3
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm27, %zmm4
; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm24, %zmm4
; AVX512DQ-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm4
; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm22, %zmm4
; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm4
; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm3
; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm27, %zmm1
; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm24, %zmm1
; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm22, %zmm1
; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm21, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm1
; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm0
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm27, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm1
; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8]
; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm30, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9]
; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm31, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10]
; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm23, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11]
; AVX512DQ-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm26, %zmm25
; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm29
; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm30, %zmm29
; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm31, %zmm18
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm23, %zmm1
; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm26, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm30, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm31, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm23, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm26, %zmm17
; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm14
; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm30, %zmm14
; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm11
; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm31, %zmm11
; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm25
; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm23, %zmm25
; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm26, %zmm16
; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 128(%r10), %zmm19
; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm0
; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm28
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm30, %zmm28
; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm1
; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm13
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm27, %zmm13
; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm16
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm16
; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm20
; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm1
; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm22, %zmm3
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6]
; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm10
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm10
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7]
; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm23, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm26, %zmm19
; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm4
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm4
; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm6
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm31, %zmm6
; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm17
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm17
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6]
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7]
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm20
; AVX512DQ-NEXT: vmovdqa64 192(%r10), %zmm8
; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm1
; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm12
; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm15
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm31, %zmm15
; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm8, %zmm27
; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm8, %zmm24
; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm2
; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm0
; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm22
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6]
; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm21
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7]
; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm9
; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm8
; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm30
; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm31
; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm23
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6]
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7]
; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1}
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm3
; AVX512DQ-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3
; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512DQ-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26
; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1}
; AVX512DQ-NEXT: vmovdqa 128(%rsi), %xmm3
; AVX512DQ-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3
; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm5
; AVX512DQ-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1}
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16
; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1}
; AVX512DQ-NEXT: vmovdqa 192(%rsi), %xmm4
; AVX512DQ-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5
; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm4
; AVX512DQ-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30
; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1}
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm1
; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm15
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2]
; AVX512DQ-NEXT: vmovdqa64 (%rsi), %ymm21
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm22
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3]
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1}
; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm14
; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm15
; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %ymm21
; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm22
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
; AVX512DQ-NEXT: vmovdqa 128(%rcx), %ymm14
; AVX512DQ-NEXT: vmovdqa 128(%rdx), %ymm15
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %ymm21
; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm22
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3
; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1}
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1}
; AVX512DQ-NEXT: vmovdqa 192(%rcx), %ymm14
; AVX512DQ-NEXT: vmovdqa 192(%rdx), %ymm15
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %ymm17
; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm19
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2]
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5
; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3]
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1728(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1664(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1216(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1152(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 704(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 640(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1984(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1920(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1856(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1792(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1600(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1536(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1472(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, 1408(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, 1344(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, 1280(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1088(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm28, 1024(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, 960(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, 896(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, 832(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, 768(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm26, 576(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm29, 512(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, 384(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, 320(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, 256(%rax)
; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, (%rax)
; AVX512DQ-NEXT: addq $2312, %rsp # imm = 0x908
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i64_stride8_vf32:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: subq $2312, %rsp # imm = 0x908
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: vmovaps 128(%rdi), %zmm0
; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm3
; AVX512DQ-FCP-NEXT: vmovaps 192(%rdx), %zmm1
; AVX512DQ-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovaps 128(%rdx), %zmm1
; AVX512DQ-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm23
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm18
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm16
; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm12
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm19
; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm25
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm17
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm15
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm20
; AVX512DQ-FCP-NEXT: movb $-64, %r11b
; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12]
; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10
; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm10
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm11
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm11
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm11
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13]
; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10
; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm10
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm11
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm11
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm11
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14]
; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10
; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm10
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm11
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm11
; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm11
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15]
; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm2
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm3
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm3
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm3
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm6
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm3
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm22, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8]
; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9]
; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10]
; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11]
; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm25
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm29
; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm29
; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm31, %zmm18
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm17
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14
; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm14
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm11
; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm11
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm25
; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25
; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm16
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r10), %zmm19
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm28
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm28
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm13
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm13
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm16
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm16
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm20
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm3
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm10
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm10
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm19
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm17
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm17
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6]
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm20
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r10), %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm12
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm15
10240; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 10241; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 10242; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 10243; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 10244; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 10245; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 10246; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] 10247; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 10248; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] 10249; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 10250; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 10251; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10252; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 10253; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 10254; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 10255; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm23 10256; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] 10257; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] 10258; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 10259; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10260; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10261; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 10262; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} 10263; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 10264; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10265; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10266; AVX512DQ-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 10267; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 10268; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} 10269; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 10270; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10271; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10272; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10273; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 10274; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 10275; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10276; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10277; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} 10278; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0 10279; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 10280; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 10281; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 10282; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] 10283; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 10284; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10285; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10286; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} 10287; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] 10288; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 10289; AVX512DQ-FCP-NEXT: vmovdqu64 
%zmm0, (%rsp) # 64-byte Spill 10290; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10291; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} 10292; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 10293; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 10294; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 10295; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 10296; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] 10297; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 10298; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10299; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} 10300; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] 10301; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 10302; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} 10303; AVX512DQ-FCP-NEXT: vmovdqa 128(%rsi), %xmm3 10304; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 10305; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 10306; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 10307; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] 10308; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 10309; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10310; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} 10311; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] 10312; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 10313; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10314; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 10315; AVX512DQ-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] 10316; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 10317; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} 10318; AVX512DQ-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 10319; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 10320; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm4 10321; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 10322; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] 10323; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 10324; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} 10325; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] 10326; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 10327; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10328; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 10329; AVX512DQ-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] 10330; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} 10331; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 10332; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10333; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 10334; AVX512DQ-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] 10335; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} 10336; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 10337; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10338; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 10339; AVX512DQ-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] 10340; 
AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 10341; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10342; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 10343; AVX512DQ-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] 10344; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 10345; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10346; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 10347; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} 10348; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm1 10349; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm15 10350; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] 10351; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %ymm21 10352; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 10353; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] 10354; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] 10355; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 10356; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 10357; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 10358; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 10359; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] 10360; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] 10361; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] 10362; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 10363; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 10364; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} 10365; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm14 10366; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm15 10367; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %ymm21 10368; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 10369; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 10370; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] 10371; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] 10372; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 10373; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 10374; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 10375; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 10376; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] 10377; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] 10378; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] 10379; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 10380; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 10381; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} 10382; AVX512DQ-FCP-NEXT: vmovdqa 128(%rcx), %ymm14 10383; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %ymm15 10384; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 10385; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %ymm21 10386; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 10387; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] 10388; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] 10389; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 10390; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} 10391; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] 10392; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] 10393; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] 10394; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 10395; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 10396; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} 10397; AVX512DQ-FCP-NEXT: vmovdqa 192(%rcx), %ymm14 10398; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %ymm15 10399; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 10400; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %ymm17 10401; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 10402; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] 10403; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] 10404; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 10405; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} 10406; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] 10407; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] 10408; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] 10409; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 10410; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 10411; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 1728(%rax) 10412; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax) 10413; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) 10414; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) 10415; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) 10416; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax) 10417; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) 10418; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) 10419; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 1984(%rax) 10420; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 1920(%rax) 10421; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 1856(%rax) 10422; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 1792(%rax) 10423; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 1600(%rax) 10424; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 1536(%rax) 10425; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 1472(%rax) 10426; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10427; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1408(%rax) 10428; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10429; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1344(%rax) 10430; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10431; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1280(%rax) 10432; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) 10433; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) 10434; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10435; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 960(%rax) 10436; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10437; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 896(%rax) 10438; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10439; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 832(%rax) 10440; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10441; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 768(%rax) 10442; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax) 10443; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 512(%rax) 10444; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10445; AVX512DQ-FCP-NEXT: 
vmovaps %zmm0, 448(%rax) 10446; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10447; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rax) 10448; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10449; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax) 10450; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10451; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax) 10452; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload 10453; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax) 10454; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10455; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax) 10456; AVX512DQ-FCP-NEXT: addq $2312, %rsp # imm = 0x908 10457; AVX512DQ-FCP-NEXT: vzeroupper 10458; AVX512DQ-FCP-NEXT: retq 10459; 10460; AVX512BW-LABEL: store_i64_stride8_vf32: 10461; AVX512BW: # %bb.0: 10462; AVX512BW-NEXT: subq $2312, %rsp # imm = 0x908 10463; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 10464; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 10465; AVX512BW-NEXT: vmovaps 128(%rdi), %zmm0 10466; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10467; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 10468; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 10469; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 10470; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 10471; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 10472; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm1 10473; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10474; AVX512BW-NEXT: vmovaps 128(%rdx), %zmm1 10475; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10476; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 10477; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 10478; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm23 10479; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm1 10480; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9 10481; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm8 10482; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm18 10483; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm16 10484; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm12 10485; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm19 10486; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm25 10487; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm17 10488; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm15 10489; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm20 10490; AVX512BW-NEXT: movb $-64, %r11b 10491; AVX512BW-NEXT: kmovd %r11d, %k1 10492; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] 10493; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 10494; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 10495; AVX512BW-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 10496; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 10497; AVX512BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 10498; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 10499; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm11 10500; AVX512BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 10501; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] 10502; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} 10503; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 10504; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10505; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] 10506; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 10507; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 10508; AVX512BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 10509; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 10510; AVX512BW-NEXT: vpermt2q 
%zmm3, %zmm24, %zmm11 10511; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 10512; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm11 10513; AVX512BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 10514; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] 10515; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} 10516; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 10517; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10518; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] 10519; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 10520; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 10521; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 10522; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 10523; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 10524; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 10525; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm11 10526; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 10527; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] 10528; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 10529; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10530; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] 10531; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 10532; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 10533; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 10534; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 10535; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 10536; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 10537; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] 10538; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 10539; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10540; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 10541; AVX512BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 10542; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 10543; AVX512BW-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 10544; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 10545; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 10546; AVX512BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 10547; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] 10548; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 10549; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 10550; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10551; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 10552; AVX512BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 10553; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 10554; AVX512BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 10555; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 10556; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 10557; AVX512BW-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 10558; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] 10559; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 10560; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 10561; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10562; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 10563; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 10564; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 10565; AVX512BW-NEXT: vpermt2q %zmm5, %zmm22, 
%zmm3 10566; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 10567; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 10568; AVX512BW-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 10569; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] 10570; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 10571; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10572; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 10573; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 10574; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] 10575; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 10576; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 10577; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] 10578; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 10579; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10580; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 10581; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 10582; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 10583; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10584; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 10585; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 10586; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 10587; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10588; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 10589; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 10590; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill 10591; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 10592; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 10593; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10594; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 10595; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 10596; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10597; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 10598; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 10599; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10600; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 10601; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10602; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 10603; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10604; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10605; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 10606; AVX512BW-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 10607; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10608; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 10609; AVX512BW-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 10610; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10611; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 10612; AVX512BW-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 10613; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10614; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 10615; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10616; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 10617; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 10618; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 10619; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 10620; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10621; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 10622; AVX512BW-NEXT: 
vpermt2q %zmm0, %zmm24, %zmm2 10623; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10624; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 10625; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 10626; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10627; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 10628; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10629; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] 10630; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 10631; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 10632; AVX512BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 10633; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10634; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] 10635; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 10636; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 10637; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 10638; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10639; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] 10640; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 10641; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 10642; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 10643; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10644; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] 10645; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 10646; AVX512BW-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 10647; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10648; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 10649; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 10650; AVX512BW-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 10651; AVX512BW-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 10652; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 10653; AVX512BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 10654; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10655; AVX512BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 10656; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10657; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 10658; AVX512BW-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 10659; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10660; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 10661; AVX512BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 10662; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10663; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 10664; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 10665; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10666; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 10667; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10668; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 10669; AVX512BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 10670; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 10671; AVX512BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 10672; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 10673; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 10674; AVX512BW-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 10675; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10676; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm19 10677; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm0 10678; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm28 10679; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 10680; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 10681; 
AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 10682; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10683; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 10684; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 10685; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 10686; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 10687; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm20 10688; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm1 10689; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 10690; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 10691; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] 10692; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 10693; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 10694; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] 10695; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 10696; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 10697; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10698; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm19 10699; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 10700; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 10701; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 10702; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 10703; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 10704; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm17 10705; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] 10706; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7] 10707; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 10708; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm8 10709; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm1 10710; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 10711; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 10712; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 10713; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 10714; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 10715; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 10716; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 10717; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm0 10718; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 10719; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] 10720; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 10721; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] 10722; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 10723; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 10724; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10725; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 10726; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 10727; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 10728; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm23 10729; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] 10730; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] 10731; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 10732; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10733; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10734; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 10735; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} 10736; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 10737; 
AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10738; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10739; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 10740; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 10741; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} 10742; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 10743; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10744; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10745; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10746; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 10747; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 10748; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10749; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10750; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} 10751; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 10752; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 10753; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 10754; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 10755; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] 10756; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 10757; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10758; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10759; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} 10760; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] 10761; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 10762; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 10763; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10764; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} 10765; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm3 10766; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 10767; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5 10768; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 10769; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] 10770; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 10771; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10772; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} 10773; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] 10774; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 10775; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} 10776; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm3 10777; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 10778; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm5 10779; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 10780; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] 10781; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 10782; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10783; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} 10784; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] 10785; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 10786; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10787; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 10788; AVX512BW-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] 10789; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 10790; AVX512BW-NEXT: vmovdqa64 %zmm12, 
%zmm30 {%k1} 10791; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm4 10792; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 10793; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm4 10794; AVX512BW-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 10795; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] 10796; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 10797; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} 10798; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] 10799; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 10800; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10801; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 10802; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] 10803; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} 10804; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 10805; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10806; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 10807; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] 10808; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} 10809; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 10810; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10811; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 10812; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] 10813; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 10814; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10815; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 10816; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] 10817; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 10818; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10819; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 10820; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} 10821; AVX512BW-NEXT: vmovdqa (%rcx), %ymm1 10822; AVX512BW-NEXT: vmovdqa (%rdx), %ymm15 10823; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] 10824; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm21 10825; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm22 10826; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] 10827; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] 10828; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 10829; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 10830; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 10831; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 10832; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] 10833; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] 10834; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] 10835; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 10836; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 10837; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} 10838; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm14 10839; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm15 10840; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm21 10841; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm22 10842; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 10843; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = 
ymm22[0],ymm21[0],ymm22[2],ymm21[2] 10844; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] 10845; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 10846; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 10847; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 10848; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 10849; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] 10850; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] 10851; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] 10852; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 10853; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 10854; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} 10855; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm14 10856; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm15 10857; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 10858; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm21 10859; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm22 10860; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] 10861; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] 10862; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 10863; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} 10864; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] 10865; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] 10866; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] 10867; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 10868; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 10869; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} 10870; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm14 10871; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm15 10872; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 10873; AVX512BW-NEXT: vmovdqa64 192(%rsi), %ymm17 10874; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm19 10875; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] 10876; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] 10877; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 10878; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} 10879; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] 10880; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] 10881; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] 10882; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 10883; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 10884; AVX512BW-NEXT: vmovdqa64 %zmm2, 1728(%rax) 10885; AVX512BW-NEXT: vmovdqa64 %zmm5, 1664(%rax) 10886; AVX512BW-NEXT: vmovdqa64 %zmm4, 1216(%rax) 10887; AVX512BW-NEXT: vmovdqa64 %zmm3, 1152(%rax) 10888; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rax) 10889; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rax) 10890; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) 10891; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) 10892; AVX512BW-NEXT: vmovdqa64 %zmm12, 1984(%rax) 10893; AVX512BW-NEXT: vmovdqa64 %zmm9, 1920(%rax) 10894; AVX512BW-NEXT: vmovdqa64 %zmm7, 1856(%rax) 10895; AVX512BW-NEXT: vmovdqa64 %zmm6, 1792(%rax) 10896; AVX512BW-NEXT: vmovdqa64 %zmm31, 1600(%rax) 10897; AVX512BW-NEXT: vmovdqa64 %zmm30, 1536(%rax) 10898; AVX512BW-NEXT: vmovdqa64 %zmm16, 1472(%rax) 10899; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10900; 
AVX512BW-NEXT: vmovaps %zmm0, 1408(%rax) 10901; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10902; AVX512BW-NEXT: vmovaps %zmm0, 1344(%rax) 10903; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10904; AVX512BW-NEXT: vmovaps %zmm0, 1280(%rax) 10905; AVX512BW-NEXT: vmovdqa64 %zmm18, 1088(%rax) 10906; AVX512BW-NEXT: vmovdqa64 %zmm28, 1024(%rax) 10907; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10908; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax) 10909; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10910; AVX512BW-NEXT: vmovaps %zmm0, 896(%rax) 10911; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10912; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) 10913; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10914; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) 10915; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%rax) 10916; AVX512BW-NEXT: vmovdqa64 %zmm29, 512(%rax) 10917; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10918; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) 10919; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10920; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) 10921; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10922; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) 10923; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10924; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) 10925; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload 10926; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) 10927; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10928; AVX512BW-NEXT: vmovaps %zmm0, (%rax) 10929; AVX512BW-NEXT: addq $2312, %rsp # imm = 0x908 10930; AVX512BW-NEXT: vzeroupper 10931; AVX512BW-NEXT: retq 10932; 10933; AVX512BW-FCP-LABEL: store_i64_stride8_vf32: 10934; AVX512BW-FCP: # %bb.0: 10935; AVX512BW-FCP-NEXT: subq $2312, %rsp # imm = 0x908 10936; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 10937; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 10938; AVX512BW-FCP-NEXT: vmovaps 128(%rdi), %zmm0 10939; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10940; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 10941; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 10942; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 10943; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 10944; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 10945; AVX512BW-FCP-NEXT: vmovaps 192(%rdx), %zmm1 10946; AVX512BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10947; AVX512BW-FCP-NEXT: vmovaps 128(%rdx), %zmm1 10948; AVX512BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10949; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 10950; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 10951; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm23 10952; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm1 10953; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 10954; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm8 10955; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm18 10956; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 10957; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm12 10958; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm19 10959; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm25 10960; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm17 10961; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15 10962; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm20 10963; AVX512BW-FCP-NEXT: movb $-64, %r11b 10964; AVX512BW-FCP-NEXT: kmovd 
%r11d, %k1 10965; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] 10966; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 10967; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 10968; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 10969; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 10970; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 10971; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 10972; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 10973; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 10974; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] 10975; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} 10976; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 10977; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10978; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] 10979; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 10980; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 10981; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 10982; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 10983; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 10984; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 10985; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 10986; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 10987; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] 10988; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} 10989; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 10990; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10991; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] 10992; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 10993; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 10994; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 10995; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 10996; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 10997; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 10998; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm11 10999; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 11000; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] 11001; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 11002; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11003; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] 11004; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 11005; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 11006; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 11007; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 11008; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 11009; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 11010; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] 11011; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 11012; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11013; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 11014; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 11015; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 11016; AVX512BW-FCP-NEXT: 
vpermt2q %zmm5, %zmm27, %zmm3 11017; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 11018; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 11019; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 11020; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] 11021; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 11022; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 11023; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11024; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 11025; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 11026; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 11027; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 11028; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 11029; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 11030; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 11031; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] 11032; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 11033; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 11034; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11035; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 11036; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 11037; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 11038; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 11039; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 11040; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 11041; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 11042; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] 11043; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 11044; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11045; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 11046; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 11047; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] 11048; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 11049; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 11050; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] 11051; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 11052; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11053; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 11054; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 11055; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 11056; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11057; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 11058; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 11059; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 11060; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11061; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 11062; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 11063; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill 11064; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 11065; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 11066; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11067; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 11068; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 11069; 
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm1
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm1
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm22, %zmm1
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8]
; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9]
; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10]
; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11]
; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm25
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm29
; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm29
; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm31, %zmm18
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm1
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm17
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14
; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm14
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11
; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm11
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25
; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25
; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm16
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm19
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28
; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm28
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1
; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm13
; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm13
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16
; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm16
; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm20
; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm3
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10
; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm10
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2
; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm2
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm19
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm4
; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm6
; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm6
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm17
; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm17
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6]
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7]
; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm20
; AVX512BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm8
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm12
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15
; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15
; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm27
; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm24
; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm0
; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm22
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6]
; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9
; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm9
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm8
; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm30
; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm31
; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm23
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6]
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7]
; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1}
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm3
; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3
; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm3
; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3
; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1}
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX512BW-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm4
; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5
; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm4
; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1}
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX512BW-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1}
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX512BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1}
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX512BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX512BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm1
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm15
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2]
; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm21
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3]
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm14
; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %ymm15
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm21
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa 128(%rcx), %ymm14
; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %ymm15
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %ymm21
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1}
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa 192(%rcx), %ymm14
; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %ymm15
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %ymm17
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2]
; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3]
; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 1728(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 1984(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 1920(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 1856(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 1792(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 1600(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 1536(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 1472(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1408(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1344(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1280(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 960(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 896(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 832(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 768(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 512(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax)
; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax)
; AVX512BW-FCP-NEXT: addq $2312, %rsp # imm = 0x908
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i64_stride8_vf32:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: subq $2312, %rsp # imm = 0x908
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: vmovaps 128(%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm3
; AVX512DQ-BW-NEXT: vmovaps 192(%rdx), %zmm1
; AVX512DQ-BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovaps 128(%rdx), %zmm1
; AVX512DQ-BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm23
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm9
; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm8
; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm18
; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm16
; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm12
; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm19
; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm25
; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm17
; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm15
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm20
; AVX512DQ-BW-NEXT: movb $-64, %r11b
; AVX512DQ-BW-NEXT: kmovd %r11d, %k1
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12]
; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10
; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm27, %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm11
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm11
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm11
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13]
; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10
; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm11
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm11
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm11
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14]
; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10
; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm11
; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm11
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15]
; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm2
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm3
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm27, %zmm3
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm3
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm3
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm24, %zmm3
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm22, %zmm3
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm22, %zmm3
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm7
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm6
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm3
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm27, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm24, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm22, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8]
; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9]
; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10]
; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11]
; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm26, %zmm25
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm29
; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm30, %zmm29
; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm31, %zmm18
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm30, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm17
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm14
; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm14
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm11
; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm11
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm25
; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25
; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm26, %zmm16
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 128(%r10), %zmm19
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm28
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm28
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm13
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm13
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm16
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm16
; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm20
; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm3
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm10
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm10
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm19
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm17
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm17
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7]
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm20
; AVX512DQ-BW-NEXT: vmovdqa64 192(%r10), %zmm8
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm12
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm15
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm15
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm27
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm24
; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm0
; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm22
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6]
; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm9
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm8
; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm30
; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm31
; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm23
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7]
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %xmm3
; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3
; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 128(%rsi), %xmm3
; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3
; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm5
; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 192(%rsi), %xmm4
; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5
; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm4
; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm1
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm15
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2]
; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm21
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm22
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 64(%rcx), %ymm14
; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %ymm15
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %ymm21
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm22
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 128(%rcx), %ymm14
; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %ymm15
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %ymm21
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm22
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 192(%rcx), %ymm14
; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %ymm15
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %ymm17
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm19
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 1728(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 1664(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1216(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 1152(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 704(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 640(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 1984(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 1920(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 1856(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 1792(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 1600(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 1536(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 1472(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1408(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1344(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1280(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 1088(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 1024(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm0, 960(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm0, 896(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm0, 832(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm0, 768(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 576(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 512(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax)
; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax)
; AVX512DQ-BW-NEXT: addq $2312, %rsp # imm = 0x908
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i64_stride8_vf32:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: subq $2312, %rsp # imm = 0x908
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: vmovaps 128(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovaps 192(%rdx), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovaps 128(%rdx), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm23
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm16
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm19
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm25
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm17
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm20
; AVX512DQ-BW-FCP-NEXT: movb $-64, %r11b
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12]
; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13]
; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14]
; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15]
; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm22, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8]
; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9]
; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10]
; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11]
; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm25
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm29
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm29
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm31, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 12086; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12087; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 12088; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 12089; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 12090; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 12091; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 12092; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 12093; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 12094; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12095; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm19 12096; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm0 12097; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 12098; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 12099; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 12100; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 12101; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12102; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 12103; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 12104; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 12105; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 12106; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm20 12107; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm1 12108; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 12109; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 12110; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] 12111; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 12112; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 12113; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] 12114; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 12115; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 12116; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12117; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm19 12118; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm4 12119; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 12120; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 12121; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 12122; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 12123; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm17 12124; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] 12125; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7] 12126; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 12127; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm8 12128; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm1 12129; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 12130; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 12131; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 12132; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 12133; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 12134; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 12135; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 12136; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 12137; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 12138; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] 
12139; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 12140; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] 12141; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 12142; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 12143; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12144; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 12145; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 12146; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 12147; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm23 12148; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] 12149; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] 12150; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 12151; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12152; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12153; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 12154; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} 12155; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 12156; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12157; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12158; AVX512DQ-BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 12159; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 12160; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} 12161; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 12162; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12163; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12164; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12165; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 12166; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 12167; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12168; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12169; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} 12170; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 12171; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 12172; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 12173; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 12174; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] 12175; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 12176; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12177; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12178; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} 12179; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] 12180; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 12181; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 12182; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12183; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} 12184; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 12185; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 12186; AVX512DQ-BW-FCP-NEXT: 
vmovdqa 64(%rdi), %xmm5 12187; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 12188; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] 12189; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 12190; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12191; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} 12192; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] 12193; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 12194; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} 12195; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm3 12196; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 12197; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 12198; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 12199; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] 12200; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 12201; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12202; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} 12203; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] 12204; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 12205; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12206; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 12207; AVX512DQ-BW-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] 12208; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 12209; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} 12210; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 12211; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 12212; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm4 12213; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 12214; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] 12215; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 12216; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} 12217; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] 12218; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 12219; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12220; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 12221; AVX512DQ-BW-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] 12222; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} 12223; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 12224; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12225; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 12226; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] 12227; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} 12228; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 12229; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12230; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 12231; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] 12232; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 12233; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12234; AVX512DQ-BW-FCP-NEXT: vpblendd $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 12235; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] 12236; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 12237; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12238; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 12239; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} 12240; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm1 12241; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm15 12242; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] 12243; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm21 12244; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 12245; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] 12246; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] 12247; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 12248; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 12249; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 12250; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 12251; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] 12252; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] 12253; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] 12254; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 12255; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 12256; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} 12257; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm14 12258; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %ymm15 12259; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm21 12260; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 12261; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 12262; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] 12263; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] 12264; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 12265; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 12266; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 12267; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 12268; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] 12269; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] 12270; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] 12271; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 12272; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 12273; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} 12274; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rcx), %ymm14 12275; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %ymm15 12276; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 12277; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %ymm21 12278; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 12279; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] 12280; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] 12281; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 12282; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 
%zmm20 {%k1} 12283; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] 12284; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] 12285; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] 12286; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 12287; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 12288; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} 12289; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rcx), %ymm14 12290; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %ymm15 12291; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] 12292; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %ymm17 12293; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 12294; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] 12295; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] 12296; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 12297; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} 12298; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] 12299; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] 12300; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] 12301; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 12302; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 12303; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 1728(%rax) 12304; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax) 12305; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) 12306; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) 12307; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) 12308; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax) 12309; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) 12310; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) 12311; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 1984(%rax) 12312; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 1920(%rax) 12313; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 1856(%rax) 12314; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 1792(%rax) 12315; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 1600(%rax) 12316; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 1536(%rax) 12317; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 1472(%rax) 12318; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12319; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1408(%rax) 12320; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12321; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1344(%rax) 12322; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12323; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1280(%rax) 12324; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) 12325; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) 12326; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12327; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 960(%rax) 12328; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12329; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 896(%rax) 12330; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12331; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 832(%rax) 12332; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12333; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 768(%rax) 12334; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax) 12335; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 
512(%rax) 12336; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12337; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) 12338; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12339; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) 12340; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12341; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) 12342; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12343; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) 12344; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload 12345; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) 12346; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12347; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax) 12348; AVX512DQ-BW-FCP-NEXT: addq $2312, %rsp # imm = 0x908 12349; AVX512DQ-BW-FCP-NEXT: vzeroupper 12350; AVX512DQ-BW-FCP-NEXT: retq 12351 %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 12352 %in.vec1 = load <32 x i64>, ptr %in.vecptr1, align 64 12353 %in.vec2 = load <32 x i64>, ptr %in.vecptr2, align 64 12354 %in.vec3 = load <32 x i64>, ptr %in.vecptr3, align 64 12355 %in.vec4 = load <32 x i64>, ptr %in.vecptr4, align 64 12356 %in.vec5 = load <32 x i64>, ptr %in.vecptr5, align 64 12357 %in.vec6 = load <32 x i64>, ptr %in.vecptr6, align 64 12358 %in.vec7 = load <32 x i64>, ptr %in.vecptr7, align 64 12359 %1 = shufflevector <32 x i64> %in.vec0, <32 x i64> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 12360 %2 = shufflevector <32 x i64> %in.vec2, <32 x i64> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 12361 %3 = shufflevector <32 x i64> %in.vec4, <32 x i64> %in.vec5, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 12362 %4 = shufflevector <32 x i64> %in.vec6, <32 x i64> %in.vec7, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 
37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 12363 %5 = shufflevector <64 x i64> %1, <64 x i64> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 12364 %6 = shufflevector <64 x i64> %3, <64 x i64> %4, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 12365 %7 = shufflevector <128 x i64> %5, <128 x i64> %6, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, 
i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> 12366 %interleaved.vec = shufflevector <256 x i64> %7, <256 x i64> poison, <256 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 160, i32 192, i32 224, i32 1, i32 33, i32 65, i32 97, i32 129, i32 161, i32 193, i32 225, i32 2, i32 34, i32 66, i32 98, i32 130, i32 162, i32 194, i32 226, i32 3, i32 35, i32 67, i32 99, i32 131, i32 163, i32 195, i32 227, i32 4, i32 36, i32 68, i32 100, i32 132, i32 164, i32 196, i32 228, i32 5, i32 37, i32 69, i32 101, i32 133, i32 165, i32 197, i32 229, i32 6, i32 38, i32 70, i32 102, i32 134, i32 166, i32 198, i32 230, i32 7, i32 39, i32 71, i32 103, i32 135, i32 167, i32 199, i32 231, i32 8, i32 40, i32 72, i32 104, i32 136, i32 168, i32 200, i32 232, i32 9, i32 41, i32 73, i32 105, i32 137, i32 169, i32 201, i32 233, i32 10, i32 42, i32 74, i32 106, i32 138, i32 170, i32 202, i32 234, i32 11, i32 43, i32 75, i32 107, i32 139, i32 171, i32 203, i32 235, i32 12, i32 44, i32 76, i32 108, i32 140, i32 172, i32 204, i32 236, i32 13, i32 45, i32 77, i32 109, i32 141, i32 173, i32 205, i32 237, i32 14, i32 46, i32 78, i32 110, i32 142, i32 174, i32 206, i32 238, i32 15, i32 47, i32 79, i32 111, i32 143, i32 175, i32 207, i32 239, i32 16, i32 48, i32 80, i32 112, i32 144, i32 176, i32 208, i32 240, i32 17, i32 49, i32 81, i32 113, i32 145, i32 177, i32 209, i32 241, i32 18, i32 50, i32 82, i32 114, i32 146, i32 178, i32 210, i32 242, i32 19, i32 51, i32 83, i32 115, i32 147, i32 179, i32 211, i32 243, i32 20, i32 52, i32 84, i32 116, i32 148, i32 180, i32 212, i32 244, i32 21, i32 53, i32 85, i32 117, i32 149, i32 181, i32 213, i32 245, i32 22, i32 54, i32 86, i32 118, i32 150, i32 182, i32 214, i32 246, i32 23, i32 55, i32 87, i32 119, i32 151, i32 183, i32 215, i32 247, i32 24, i32 56, i32 88, i32 120, i32 152, i32 184, i32 216, i32 248, i32 25, i32 57, i32 89, i32 121, i32 153, i32 185, i32 217, i32 249, i32 26, i32 58, i32 90, i32 122, i32 154, i32 186, i32 218, i32 250, i32 27, i32 59, i32 91, i32 123, i32 155, i32 187, i32 219, i32 251, i32 28, i32 60, i32 92, i32 124, i32 156, i32 188, i32 220, i32 252, i32 29, i32 61, i32 93, i32 125, i32 157, i32 189, i32 221, i32 253, i32 30, i32 62, i32 94, i32 126, i32 158, i32 190, i32 222, i32 254, i32 31, i32 63, i32 95, i32 127, i32 159, i32 191, i32 223, i32 255> 12367 store <256 x i64> %interleaved.vec, ptr %out.vec, align 64 12368 ret void 12369} 12370 12371define void 
@store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { 12372; SSE-LABEL: store_i64_stride8_vf64: 12373; SSE: # %bb.0: 12374; SSE-NEXT: subq $3736, %rsp # imm = 0xE98 12375; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 12376; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 12377; SSE-NEXT: movaps (%rdi), %xmm7 12378; SSE-NEXT: movaps 16(%rdi), %xmm8 12379; SSE-NEXT: movaps (%rsi), %xmm1 12380; SSE-NEXT: movaps 16(%rsi), %xmm0 12381; SSE-NEXT: movaps (%rdx), %xmm9 12382; SSE-NEXT: movaps 16(%rdx), %xmm10 12383; SSE-NEXT: movaps (%rcx), %xmm3 12384; SSE-NEXT: movaps 16(%rcx), %xmm2 12385; SSE-NEXT: movaps 16(%r8), %xmm12 12386; SSE-NEXT: movaps (%r8), %xmm11 12387; SSE-NEXT: movaps 16(%r9), %xmm4 12388; SSE-NEXT: movaps (%r9), %xmm5 12389; SSE-NEXT: movaps 16(%r10), %xmm14 12390; SSE-NEXT: movaps (%r10), %xmm13 12391; SSE-NEXT: movaps (%rax), %xmm6 12392; SSE-NEXT: movaps %xmm7, %xmm15 12393; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0] 12394; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12395; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] 12396; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12397; SSE-NEXT: movaps %xmm9, %xmm1 12398; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] 12399; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12400; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1] 12401; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12402; SSE-NEXT: movaps %xmm11, %xmm1 12403; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] 12404; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12405; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1] 12406; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12407; SSE-NEXT: movaps %xmm13, %xmm1 12408; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] 12409; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12410; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] 12411; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12412; SSE-NEXT: movaps %xmm8, %xmm1 12413; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12414; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12415; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] 12416; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12417; SSE-NEXT: movaps %xmm10, %xmm0 12418; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] 12419; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12420; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] 12421; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12422; SSE-NEXT: movaps %xmm12, %xmm0 12423; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] 12424; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12425; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] 12426; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12427; SSE-NEXT: movaps 16(%rax), %xmm0 12428; SSE-NEXT: movaps %xmm14, %xmm1 12429; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12430; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12431; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] 12432; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12433; SSE-NEXT: movaps 32(%rdi), %xmm2 12434; SSE-NEXT: movaps 32(%rsi), %xmm0 12435; SSE-NEXT: movaps %xmm2, %xmm1 12436; SSE-NEXT: movlhps {{.*#+}} 
xmm1 = xmm1[0],xmm0[0] 12437; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12438; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12439; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12440; SSE-NEXT: movaps 32(%rdx), %xmm2 12441; SSE-NEXT: movaps 32(%rcx), %xmm0 12442; SSE-NEXT: movaps %xmm2, %xmm1 12443; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12444; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12445; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12446; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12447; SSE-NEXT: movaps 32(%r8), %xmm2 12448; SSE-NEXT: movaps 32(%r9), %xmm0 12449; SSE-NEXT: movaps %xmm2, %xmm1 12450; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12451; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12452; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12453; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12454; SSE-NEXT: movaps 32(%r10), %xmm2 12455; SSE-NEXT: movaps 32(%rax), %xmm0 12456; SSE-NEXT: movaps %xmm2, %xmm1 12457; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12458; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12459; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12460; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12461; SSE-NEXT: movaps 48(%rdi), %xmm2 12462; SSE-NEXT: movaps 48(%rsi), %xmm0 12463; SSE-NEXT: movaps %xmm2, %xmm1 12464; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12465; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12466; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12467; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12468; SSE-NEXT: movaps 48(%rdx), %xmm2 12469; SSE-NEXT: movaps 48(%rcx), %xmm0 12470; SSE-NEXT: movaps %xmm2, %xmm1 12471; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12472; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12473; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12474; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12475; SSE-NEXT: movaps 48(%r8), %xmm2 12476; SSE-NEXT: movaps 48(%r9), %xmm0 12477; SSE-NEXT: movaps %xmm2, %xmm1 12478; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12479; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12480; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12481; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12482; SSE-NEXT: movaps 48(%r10), %xmm2 12483; SSE-NEXT: movaps 48(%rax), %xmm0 12484; SSE-NEXT: movaps %xmm2, %xmm1 12485; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12486; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12487; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12488; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12489; SSE-NEXT: movaps 64(%rdi), %xmm2 12490; SSE-NEXT: movaps 64(%rsi), %xmm0 12491; SSE-NEXT: movaps %xmm2, %xmm1 12492; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12493; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12494; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12495; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12496; SSE-NEXT: movaps 64(%rdx), %xmm2 12497; SSE-NEXT: movaps 64(%rcx), %xmm0 12498; SSE-NEXT: movaps %xmm2, %xmm1 12499; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12500; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12501; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12502; SSE-NEXT: 
movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12503; SSE-NEXT: movaps 64(%r8), %xmm2 12504; SSE-NEXT: movaps 64(%r9), %xmm0 12505; SSE-NEXT: movaps %xmm2, %xmm1 12506; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12507; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12508; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12509; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12510; SSE-NEXT: movaps 64(%r10), %xmm2 12511; SSE-NEXT: movaps 64(%rax), %xmm0 12512; SSE-NEXT: movaps %xmm2, %xmm1 12513; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12514; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12515; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12516; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12517; SSE-NEXT: movaps 80(%rdi), %xmm2 12518; SSE-NEXT: movaps 80(%rsi), %xmm0 12519; SSE-NEXT: movaps %xmm2, %xmm1 12520; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12521; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12522; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12523; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12524; SSE-NEXT: movaps 80(%rdx), %xmm2 12525; SSE-NEXT: movaps 80(%rcx), %xmm0 12526; SSE-NEXT: movaps %xmm2, %xmm1 12527; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12528; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12529; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12530; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12531; SSE-NEXT: movaps 80(%r8), %xmm2 12532; SSE-NEXT: movaps 80(%r9), %xmm0 12533; SSE-NEXT: movaps %xmm2, %xmm1 12534; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12535; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12536; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12537; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12538; SSE-NEXT: movaps 80(%r10), %xmm2 12539; SSE-NEXT: movaps 80(%rax), %xmm0 12540; SSE-NEXT: movaps %xmm2, %xmm1 12541; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12542; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12543; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12544; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12545; SSE-NEXT: movaps 96(%rdi), %xmm2 12546; SSE-NEXT: movaps 96(%rsi), %xmm0 12547; SSE-NEXT: movaps %xmm2, %xmm1 12548; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12549; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12550; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12551; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12552; SSE-NEXT: movaps 96(%rdx), %xmm2 12553; SSE-NEXT: movaps 96(%rcx), %xmm0 12554; SSE-NEXT: movaps %xmm2, %xmm1 12555; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12556; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12557; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12558; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12559; SSE-NEXT: movaps 96(%r8), %xmm2 12560; SSE-NEXT: movaps 96(%r9), %xmm0 12561; SSE-NEXT: movaps %xmm2, %xmm1 12562; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12563; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12564; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12565; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12566; SSE-NEXT: movaps 96(%r10), %xmm2 12567; SSE-NEXT: movaps 96(%rax), %xmm0 12568; SSE-NEXT: movaps %xmm2, %xmm1 
12569; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12570; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12571; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12572; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12573; SSE-NEXT: movaps 112(%rdi), %xmm2 12574; SSE-NEXT: movaps 112(%rsi), %xmm0 12575; SSE-NEXT: movaps %xmm2, %xmm1 12576; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12577; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12578; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12579; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12580; SSE-NEXT: movaps 112(%rdx), %xmm2 12581; SSE-NEXT: movaps 112(%rcx), %xmm0 12582; SSE-NEXT: movaps %xmm2, %xmm1 12583; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12584; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12585; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12586; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12587; SSE-NEXT: movaps 112(%r8), %xmm2 12588; SSE-NEXT: movaps 112(%r9), %xmm0 12589; SSE-NEXT: movaps %xmm2, %xmm1 12590; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12591; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12592; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12593; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12594; SSE-NEXT: movaps 112(%r10), %xmm2 12595; SSE-NEXT: movaps 112(%rax), %xmm0 12596; SSE-NEXT: movaps %xmm2, %xmm1 12597; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12598; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12599; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12600; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12601; SSE-NEXT: movaps 128(%rdi), %xmm2 12602; SSE-NEXT: movaps 128(%rsi), %xmm0 12603; SSE-NEXT: movaps %xmm2, %xmm1 12604; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12605; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12606; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12607; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12608; SSE-NEXT: movaps 128(%rdx), %xmm2 12609; SSE-NEXT: movaps 128(%rcx), %xmm0 12610; SSE-NEXT: movaps %xmm2, %xmm1 12611; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12612; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12613; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12614; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12615; SSE-NEXT: movaps 128(%r8), %xmm2 12616; SSE-NEXT: movaps 128(%r9), %xmm0 12617; SSE-NEXT: movaps %xmm2, %xmm1 12618; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12619; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12620; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12621; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12622; SSE-NEXT: movaps 128(%r10), %xmm2 12623; SSE-NEXT: movaps 128(%rax), %xmm0 12624; SSE-NEXT: movaps %xmm2, %xmm1 12625; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12626; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12627; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12628; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12629; SSE-NEXT: movaps 144(%rdi), %xmm2 12630; SSE-NEXT: movaps 144(%rsi), %xmm0 12631; SSE-NEXT: movaps %xmm2, %xmm1 12632; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12633; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12634; SSE-NEXT: unpckhpd 
{{.*#+}} xmm2 = xmm2[1],xmm0[1] 12635; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12636; SSE-NEXT: movaps 144(%rdx), %xmm2 12637; SSE-NEXT: movaps 144(%rcx), %xmm0 12638; SSE-NEXT: movaps %xmm2, %xmm1 12639; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12640; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12641; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12642; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12643; SSE-NEXT: movaps 144(%r8), %xmm2 12644; SSE-NEXT: movaps 144(%r9), %xmm0 12645; SSE-NEXT: movaps %xmm2, %xmm1 12646; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12647; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12648; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12649; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12650; SSE-NEXT: movaps 144(%r10), %xmm2 12651; SSE-NEXT: movaps 144(%rax), %xmm0 12652; SSE-NEXT: movaps %xmm2, %xmm1 12653; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12654; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12655; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12656; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12657; SSE-NEXT: movaps 160(%rdi), %xmm2 12658; SSE-NEXT: movaps 160(%rsi), %xmm0 12659; SSE-NEXT: movaps %xmm2, %xmm1 12660; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12661; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12662; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12663; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12664; SSE-NEXT: movaps 160(%rdx), %xmm2 12665; SSE-NEXT: movaps 160(%rcx), %xmm0 12666; SSE-NEXT: movaps %xmm2, %xmm1 12667; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12668; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12669; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12670; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12671; SSE-NEXT: movaps 160(%r8), %xmm2 12672; SSE-NEXT: movaps 160(%r9), %xmm0 12673; SSE-NEXT: movaps %xmm2, %xmm1 12674; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12675; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12676; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12677; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12678; SSE-NEXT: movaps 160(%r10), %xmm2 12679; SSE-NEXT: movaps 160(%rax), %xmm0 12680; SSE-NEXT: movaps %xmm2, %xmm1 12681; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12682; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12683; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12684; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12685; SSE-NEXT: movaps 176(%rdi), %xmm2 12686; SSE-NEXT: movaps 176(%rsi), %xmm0 12687; SSE-NEXT: movaps %xmm2, %xmm1 12688; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12689; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12690; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12691; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12692; SSE-NEXT: movaps 176(%rdx), %xmm2 12693; SSE-NEXT: movaps 176(%rcx), %xmm0 12694; SSE-NEXT: movaps %xmm2, %xmm1 12695; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12696; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12697; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12698; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12699; SSE-NEXT: movaps 176(%r8), %xmm2 12700; 
SSE-NEXT: movaps 176(%r9), %xmm0 12701; SSE-NEXT: movaps %xmm2, %xmm1 12702; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12703; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12704; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12705; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12706; SSE-NEXT: movaps 176(%r10), %xmm2 12707; SSE-NEXT: movaps 176(%rax), %xmm0 12708; SSE-NEXT: movaps %xmm2, %xmm1 12709; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12710; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12711; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12712; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12713; SSE-NEXT: movaps 192(%rdi), %xmm2 12714; SSE-NEXT: movaps 192(%rsi), %xmm0 12715; SSE-NEXT: movaps %xmm2, %xmm1 12716; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12717; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12718; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12719; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12720; SSE-NEXT: movaps 192(%rdx), %xmm2 12721; SSE-NEXT: movaps 192(%rcx), %xmm0 12722; SSE-NEXT: movaps %xmm2, %xmm1 12723; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12724; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12725; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12726; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12727; SSE-NEXT: movaps 192(%r8), %xmm2 12728; SSE-NEXT: movaps 192(%r9), %xmm0 12729; SSE-NEXT: movaps %xmm2, %xmm1 12730; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12731; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12732; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12733; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12734; SSE-NEXT: movaps 192(%r10), %xmm2 12735; SSE-NEXT: movaps 192(%rax), %xmm0 12736; SSE-NEXT: movaps %xmm2, %xmm1 12737; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12738; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12739; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12740; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12741; SSE-NEXT: movaps 208(%rdi), %xmm2 12742; SSE-NEXT: movaps 208(%rsi), %xmm0 12743; SSE-NEXT: movaps %xmm2, %xmm1 12744; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12745; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12746; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12747; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12748; SSE-NEXT: movaps 208(%rdx), %xmm2 12749; SSE-NEXT: movaps 208(%rcx), %xmm0 12750; SSE-NEXT: movaps %xmm2, %xmm1 12751; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12752; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12753; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12754; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12755; SSE-NEXT: movaps 208(%r8), %xmm2 12756; SSE-NEXT: movaps 208(%r9), %xmm0 12757; SSE-NEXT: movaps %xmm2, %xmm1 12758; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12759; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12760; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 12761; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12762; SSE-NEXT: movaps 208(%r10), %xmm2 12763; SSE-NEXT: movaps 208(%rax), %xmm0 12764; SSE-NEXT: movaps %xmm2, %xmm1 12765; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 12766; SSE-NEXT: movaps 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 224(%rdi), %xmm2
; SSE-NEXT: movaps 224(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 224(%rdx), %xmm2
; SSE-NEXT: movaps 224(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 224(%r8), %xmm2
; SSE-NEXT: movaps 224(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 224(%r10), %xmm2
; SSE-NEXT: movaps 224(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 240(%rdi), %xmm2
; SSE-NEXT: movaps 240(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 240(%rdx), %xmm2
; SSE-NEXT: movaps 240(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 240(%r8), %xmm2
; SSE-NEXT: movaps 240(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 240(%r10), %xmm2
; SSE-NEXT: movaps 240(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 256(%rdi), %xmm2
; SSE-NEXT: movaps 256(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 256(%rdx), %xmm2
; SSE-NEXT: movaps 256(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 256(%r8), %xmm2
; SSE-NEXT: movaps 256(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 256(%r10), %xmm2
; SSE-NEXT: movaps 256(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 272(%rdi), %xmm2
; SSE-NEXT: movaps 272(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 272(%rdx), %xmm2
; SSE-NEXT: movaps 272(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 272(%r8), %xmm2
; SSE-NEXT: movaps 272(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 272(%r10), %xmm2
; SSE-NEXT: movaps 272(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 288(%rdi), %xmm2
; SSE-NEXT: movaps 288(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 288(%rdx), %xmm2
; SSE-NEXT: movaps 288(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 288(%r8), %xmm2
; SSE-NEXT: movaps 288(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 288(%r10), %xmm2
; SSE-NEXT: movaps 288(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 304(%rdi), %xmm2
; SSE-NEXT: movaps 304(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 304(%rdx), %xmm2
; SSE-NEXT: movaps 304(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 304(%r8), %xmm2
; SSE-NEXT: movaps 304(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 304(%r10), %xmm2
; SSE-NEXT: movaps 304(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 320(%rdi), %xmm2
; SSE-NEXT: movaps 320(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 320(%rdx), %xmm2
; SSE-NEXT: movaps 320(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 320(%r8), %xmm2
; SSE-NEXT: movaps 320(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 320(%r10), %xmm2
; SSE-NEXT: movaps 320(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 336(%rdi), %xmm2
; SSE-NEXT: movaps 336(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 336(%rdx), %xmm2
; SSE-NEXT: movaps 336(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 336(%r8), %xmm2
; SSE-NEXT: movaps 336(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 336(%r10), %xmm2
; SSE-NEXT: movaps 336(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 352(%rdi), %xmm2
; SSE-NEXT: movaps 352(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 352(%rdx), %xmm2
; SSE-NEXT: movaps 352(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 352(%r8), %xmm2
; SSE-NEXT: movaps 352(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 352(%r10), %xmm2
; SSE-NEXT: movaps 352(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 368(%rdi), %xmm2
; SSE-NEXT: movaps 368(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 368(%rdx), %xmm2
; SSE-NEXT: movaps 368(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 368(%r8), %xmm2
; SSE-NEXT: movaps 368(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 368(%r10), %xmm2
; SSE-NEXT: movaps 368(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 384(%rdi), %xmm2
; SSE-NEXT: movaps 384(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 384(%rdx), %xmm2
; SSE-NEXT: movaps 384(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 384(%r8), %xmm2
; SSE-NEXT: movaps 384(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 384(%r10), %xmm2
; SSE-NEXT: movaps 384(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 400(%rdi), %xmm2
; SSE-NEXT: movaps 400(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 400(%rdx), %xmm2
; SSE-NEXT: movaps 400(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 400(%r8), %xmm2
; SSE-NEXT: movaps 400(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 400(%r10), %xmm2
; SSE-NEXT: movaps 400(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 416(%rdi), %xmm2
; SSE-NEXT: movaps 416(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 416(%rdx), %xmm2
; SSE-NEXT: movaps 416(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 416(%r8), %xmm2
; SSE-NEXT: movaps 416(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 416(%r10), %xmm2
; SSE-NEXT: movaps 416(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 432(%rdi), %xmm2
; SSE-NEXT: movaps 432(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 432(%rdx), %xmm2
; SSE-NEXT: movaps 432(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 432(%r8), %xmm2
; SSE-NEXT: movaps 432(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 432(%r10), %xmm2
; SSE-NEXT: movaps 432(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 448(%rdi), %xmm2
; SSE-NEXT: movaps 448(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 448(%rdx), %xmm2
; SSE-NEXT: movaps 448(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 448(%r8), %xmm2
; SSE-NEXT: movaps 448(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 448(%r10), %xmm2
; SSE-NEXT: movaps 448(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 464(%rdi), %xmm2
; SSE-NEXT: movaps 464(%rsi), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 464(%rdx), %xmm2
; SSE-NEXT: movaps 464(%rcx), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 464(%r8), %xmm2
; SSE-NEXT: movaps 464(%r9), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 464(%r10), %xmm2
; SSE-NEXT: movaps 464(%rax), %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 480(%rdi), %xmm13
; SSE-NEXT: movaps 480(%rsi), %xmm0
; SSE-NEXT: movaps %xmm13, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
; SSE-NEXT: movaps 480(%rdx), %xmm10
; SSE-NEXT: movaps 480(%rcx), %xmm0
; SSE-NEXT: movaps %xmm10, %xmm15
; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
; SSE-NEXT: movaps 480(%r8), %xmm9
; SSE-NEXT: movaps 480(%r9), %xmm0
; SSE-NEXT: movaps %xmm9, %xmm14
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1]
; SSE-NEXT: movaps 480(%r10), %xmm11
; SSE-NEXT: movaps 480(%rax), %xmm1
; SSE-NEXT: movaps %xmm11, %xmm12
; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1]
; SSE-NEXT: movaps 496(%rdi), %xmm7
; SSE-NEXT: movaps 496(%rsi), %xmm0
; SSE-NEXT: movaps %xmm7, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1]
; SSE-NEXT: movaps 496(%rdx), %xmm5
; SSE-NEXT: movaps 496(%rcx), %xmm1
; SSE-NEXT: movaps %xmm5, %xmm6
; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE-NEXT: movaps 496(%r8), %xmm1
; SSE-NEXT: movaps 496(%r9), %xmm2
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE-NEXT: movaps 496(%r10), %xmm2
; SSE-NEXT: movaps 496(%rax), %xmm3
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-NEXT: movaps %xmm2, 4080(%rcx)
; SSE-NEXT: movaps %xmm1, 4064(%rcx)
; SSE-NEXT: movaps %xmm5, 4048(%rcx)
; SSE-NEXT: movaps %xmm7, 4032(%rcx)
; SSE-NEXT: movaps %xmm0, 4016(%rcx)
; SSE-NEXT: movaps %xmm4, 4000(%rcx)
; SSE-NEXT: movaps %xmm6, 3984(%rcx)
; SSE-NEXT: movaps %xmm8, 3968(%rcx)
; SSE-NEXT: movaps %xmm11, 3952(%rcx)
; SSE-NEXT: movaps %xmm9, 3936(%rcx)
; SSE-NEXT: movaps %xmm10, 3920(%rcx)
; SSE-NEXT: movaps %xmm13, 3904(%rcx)
; SSE-NEXT: movaps %xmm12, 3888(%rcx)
; SSE-NEXT: movaps %xmm14, 3872(%rcx)
; SSE-NEXT: movaps %xmm15, 3856(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3840(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3824(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3808(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3792(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3776(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3760(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3744(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3728(%rcx)
; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3712(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3696(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3680(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3664(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3648(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3632(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3616(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3600(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3584(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3568(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3552(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3536(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3520(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3504(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3488(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3472(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3456(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3440(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3424(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3408(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3392(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3376(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3360(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3344(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3328(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3312(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3296(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3280(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3264(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3248(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3232(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3216(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3200(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3184(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3168(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3152(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3136(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3120(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3104(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3088(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3072(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3056(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3040(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3024(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 3008(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2992(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2976(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2960(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2944(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2928(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2912(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2896(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2880(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2864(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2848(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2832(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2816(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2800(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2784(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2768(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2752(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2736(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2720(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2704(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2688(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2672(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2656(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2640(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2624(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2608(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2592(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2576(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2560(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2544(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2528(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2512(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2496(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2480(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2464(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2448(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2432(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2416(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2400(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2384(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2368(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2352(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2336(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2320(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2304(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2288(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2272(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2256(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2240(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2224(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2208(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2192(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2176(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2160(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2144(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2128(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2112(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2096(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2080(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2064(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2048(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2032(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2016(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 2000(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1984(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1968(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1952(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1936(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1920(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1904(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1888(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1872(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1856(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1840(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1824(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1808(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1792(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1776(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1760(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1744(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1728(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1712(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1696(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1680(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1664(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1648(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1632(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1616(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1600(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1584(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1568(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1552(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1536(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1520(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1504(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1488(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1472(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1456(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1440(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1424(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1408(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1392(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1376(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1360(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1344(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1328(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1312(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1296(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1280(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1264(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1248(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1232(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1216(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1200(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1184(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1168(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1152(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1136(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1120(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1104(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1088(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1072(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1056(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1040(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1024(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1008(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 992(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 976(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 960(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 944(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 928(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 912(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 896(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 880(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 864(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 848(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 832(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 816(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 800(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 784(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 768(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 752(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 736(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 720(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 704(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 688(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 672(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 656(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 640(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 624(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 608(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 592(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 576(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 560(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 544(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 528(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 512(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 496(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 480(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 464(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 448(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 432(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 416(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 400(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 384(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 368(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 352(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 336(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 320(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 304(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 288(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 272(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 256(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 240(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 224(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 208(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 192(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 176(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 160(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 144(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 128(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 112(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 96(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 64(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 48(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, (%rcx)
; SSE-NEXT: addq $3736, %rsp # imm = 0xE98
; SSE-NEXT: retq
;
; AVX-LABEL: store_i64_stride8_vf64:
; AVX: # %bb.0:
; AVX-NEXT: subq $3784, %rsp # imm = 0xEC8
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps (%rsi), %xmm0
; AVX-NEXT: vmovaps (%rdi), %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovaps (%rcx), %xmm3
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm2, %ymm2
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps (%r9), %xmm2
; AVX-NEXT: vmovaps (%r8), %xmm4
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0]
; AVX-NEXT: vmovaps (%rax), %xmm6
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7
; AVX-NEXT: vinsertf128 $1, (%r10), %ymm5, %ymm5
; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 8(%rdx), %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 8(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 32(%rsi), %xmm4
; AVX-NEXT: vmovaps 32(%rdi), %xmm5
; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0]
; AVX-NEXT: vmovaps 32(%rcx), %xmm6
; AVX-NEXT: vmovaps 64(%rcx), %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm2
; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm1, %ymm1
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[2]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 32(%r9), %xmm7
; AVX-NEXT: vmovaps 64(%r9), %xmm1
; AVX-NEXT: vmovaps 32(%r8), %xmm8
; AVX-NEXT: vmovaps 64(%r8), %xmm2
; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm8[0],xmm7[0]
; AVX-NEXT: vmovaps 32(%rax), %xmm9
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm10
; AVX-NEXT: vinsertf128 $1, 32(%r10), %ymm3, %ymm3
; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2],ymm10[2]
; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 64(%rax), %xmm3
; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm5
; AVX-NEXT: vbroadcastsd 40(%rdx), %ymm6
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm5
; AVX-NEXT: vbroadcastsd 40(%r10), %ymm6
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 64(%rsi), %xmm4
; AVX-NEXT: vmovaps 64(%rdi), %xmm5
; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm7
; AVX-NEXT: vinsertf128 $1, 64(%rdx), %ymm6, %ymm6
; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm7
; AVX-NEXT: vinsertf128 $1, 64(%r10), %ymm6, %ymm6
; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX-NEXT: vbroadcastsd 72(%rdx), %ymm5
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm1[1]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 72(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 96(%rsi), %xmm0
; AVX-NEXT: vmovaps 96(%rdi), %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovaps 96(%rcx), %xmm3
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
; AVX-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm2
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 96(%r9), %xmm2
; AVX-NEXT: vmovaps 96(%r8), %xmm4
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0]
; AVX-NEXT: vmovaps 96(%rax), %xmm6
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7
; AVX-NEXT: vinsertf128 $1, 96(%r10), %ymm5, %ymm5
; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 104(%rdx), %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 104(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 128(%rsi), %xmm0
; AVX-NEXT: vmovaps 128(%rdi), %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovaps 128(%rcx), %xmm3
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
; AVX-NEXT: vinsertf128 $1, 128(%rdx), %ymm2, %ymm2
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 128(%r9), %xmm2
; AVX-NEXT: vmovaps 128(%r8), %xmm4
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0]
; AVX-NEXT: vmovaps 128(%rax), %xmm6
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7
; AVX-NEXT: vinsertf128 $1, 128(%r10), %ymm5, %ymm5
; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 136(%rdx), %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 136(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 160(%rcx), %xmm0
; AVX-NEXT: vmovaps 160(%rsi), %xmm1
; AVX-NEXT: vmovaps 160(%rdi), %xmm2
; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4
; AVX-NEXT: vinsertf128 $1, 160(%rdx), %ymm3, %ymm3
; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2]
; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 160(%r9), %xmm3
; AVX-NEXT: vmovaps 160(%r8), %xmm4
; AVX-NEXT: vmovaps 160(%rax), %xmm5
; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7
; AVX-NEXT: vinsertf128 $1, 160(%r10), %ymm6, %ymm6
; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: vbroadcastsd 168(%rdx), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 168(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 192(%rsi), %xmm0
; AVX-NEXT: vmovaps 192(%rdi), %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovaps 192(%rcx), %xmm3
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
; AVX-NEXT: vinsertf128 $1, 192(%rdx), %ymm2, %ymm2
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 192(%r9), %xmm2
; AVX-NEXT: vmovaps 192(%r8), %xmm4
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0]
; AVX-NEXT: vmovaps 192(%rax), %xmm6
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7
; AVX-NEXT: vinsertf128 $1, 192(%r10), %ymm5, %ymm5
; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 200(%rdx), %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 200(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 224(%rcx), %xmm0
; AVX-NEXT: vmovaps 224(%rsi), %xmm1
; AVX-NEXT: vmovaps 224(%rdi), %xmm2
; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4
; AVX-NEXT: vinsertf128 $1, 224(%rdx), %ymm3, %ymm3
; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2]
; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 224(%r9), %xmm3
; AVX-NEXT: vmovaps 224(%r8), %xmm4
; AVX-NEXT: vmovaps 224(%rax), %xmm5
; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7
; AVX-NEXT: vinsertf128 $1, 224(%r10), %ymm6, %ymm6
; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: vbroadcastsd 232(%rdx), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 232(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 256(%rsi), %xmm0
; AVX-NEXT: vmovaps 256(%rdi), %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovaps 256(%rcx), %xmm3
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
; AVX-NEXT: vinsertf128 $1, 256(%rdx), %ymm2, %ymm2
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 256(%r9), %xmm2
; AVX-NEXT: vmovaps 256(%r8), %xmm4
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0]
; AVX-NEXT: vmovaps 256(%rax), %xmm6
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7
; AVX-NEXT: vinsertf128 $1, 256(%r10), %ymm5, %ymm5
; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 264(%rdx), %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 264(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 288(%rsi), %xmm0
; AVX-NEXT: vmovaps 288(%rdi), %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovaps 288(%rcx), %xmm3
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
; AVX-NEXT: vinsertf128 $1, 288(%rdx), %ymm2, %ymm2
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 288(%r9), %xmm2
; AVX-NEXT: vmovaps 288(%r8), %xmm4
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0]
; AVX-NEXT: vmovaps 288(%rax), %xmm6
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7
; AVX-NEXT: vinsertf128 $1, 288(%r10), %ymm5, %ymm5
; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 296(%rdx), %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 296(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 320(%rsi), %xmm0
; AVX-NEXT: vmovaps 320(%rdi), %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovaps 320(%rcx), %xmm3
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
; AVX-NEXT: vinsertf128 $1, 320(%rdx), %ymm2, %ymm2
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 320(%r9), %xmm2
; AVX-NEXT: vmovaps 320(%r8), %xmm4
; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0]
; AVX-NEXT: vmovaps 320(%rax), %xmm6
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7
; AVX-NEXT: vinsertf128 $1, 320(%r10), %ymm5, %ymm5
; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 328(%rdx), %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 328(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 352(%rcx), %xmm0
; AVX-NEXT: vmovaps 352(%rsi), %xmm1
; AVX-NEXT: vmovaps 352(%rdi), %xmm2
; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4
; AVX-NEXT: vinsertf128 $1, 352(%rdx), %ymm3, %ymm3
; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2]
; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 352(%r9), %xmm3
; AVX-NEXT: vmovaps 352(%r8), %xmm4
; AVX-NEXT: vmovaps 352(%rax), %xmm5
; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7
; AVX-NEXT: vinsertf128 $1, 352(%r10), %ymm6, %ymm6
; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2]
; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: vbroadcastsd 360(%rdx), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1
; AVX-NEXT: vbroadcastsd 360(%r10), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vmovups
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14100; AVX-NEXT: vmovaps 384(%rcx), %xmm0 14101; AVX-NEXT: vmovaps 384(%rsi), %xmm1 14102; AVX-NEXT: vmovaps 384(%rdi), %xmm2 14103; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] 14104; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4 14105; AVX-NEXT: vinsertf128 $1, 384(%rdx), %ymm3, %ymm3 14106; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] 14107; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14108; AVX-NEXT: vmovaps 384(%r9), %xmm3 14109; AVX-NEXT: vmovaps 384(%r8), %xmm4 14110; AVX-NEXT: vmovaps 384(%rax), %xmm5 14111; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] 14112; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 14113; AVX-NEXT: vinsertf128 $1, 384(%r10), %ymm6, %ymm6 14114; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] 14115; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14116; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 14117; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 14118; AVX-NEXT: vbroadcastsd 392(%rdx), %ymm2 14119; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 14120; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] 14121; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14122; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] 14123; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 14124; AVX-NEXT: vbroadcastsd 392(%r10), %ymm2 14125; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 14126; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 14127; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14128; AVX-NEXT: vmovaps 416(%rcx), %xmm0 14129; AVX-NEXT: vmovaps 416(%rsi), %xmm1 14130; AVX-NEXT: vmovaps 416(%rdi), %xmm2 14131; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] 14132; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4 14133; AVX-NEXT: vinsertf128 $1, 416(%rdx), %ymm3, %ymm3 14134; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] 14135; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14136; AVX-NEXT: vmovaps 416(%r9), %xmm3 14137; AVX-NEXT: vmovaps 416(%r8), %xmm4 14138; AVX-NEXT: vmovaps 416(%rax), %xmm5 14139; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] 14140; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 14141; AVX-NEXT: vinsertf128 $1, 416(%r10), %ymm6, %ymm6 14142; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] 14143; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14144; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 14145; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 14146; AVX-NEXT: vbroadcastsd 424(%rdx), %ymm2 14147; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 14148; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] 14149; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14150; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] 14151; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 14152; AVX-NEXT: vbroadcastsd 424(%r10), %ymm2 14153; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 14154; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 14155; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14156; AVX-NEXT: vmovaps 448(%rcx), %xmm0 14157; AVX-NEXT: vmovaps 448(%rsi), %xmm1 14158; AVX-NEXT: vmovaps 448(%rdi), %xmm2 14159; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] 14160; AVX-NEXT: vinsertf128 
$1, %xmm0, %ymm3, %ymm4 14161; AVX-NEXT: vinsertf128 $1, 448(%rdx), %ymm3, %ymm3 14162; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] 14163; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14164; AVX-NEXT: vmovaps 448(%r9), %xmm3 14165; AVX-NEXT: vmovaps 448(%r8), %xmm4 14166; AVX-NEXT: vmovaps 448(%rax), %xmm5 14167; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] 14168; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 14169; AVX-NEXT: vinsertf128 $1, 448(%r10), %ymm6, %ymm6 14170; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] 14171; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14172; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 14173; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 14174; AVX-NEXT: vbroadcastsd 456(%rdx), %ymm2 14175; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 14176; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] 14177; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14178; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] 14179; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 14180; AVX-NEXT: vbroadcastsd 456(%r10), %ymm2 14181; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 14182; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 14183; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14184; AVX-NEXT: vmovaps 480(%rcx), %xmm0 14185; AVX-NEXT: vmovaps 480(%rsi), %xmm1 14186; AVX-NEXT: vmovaps 480(%rdi), %xmm2 14187; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] 14188; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4 14189; AVX-NEXT: vinsertf128 $1, 480(%rdx), %ymm3, %ymm3 14190; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] 14191; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14192; AVX-NEXT: vmovaps 480(%r9), %xmm3 14193; AVX-NEXT: vmovaps 480(%r8), %xmm4 14194; AVX-NEXT: vmovaps 480(%rax), %xmm5 14195; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] 14196; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 14197; AVX-NEXT: vinsertf128 $1, 480(%r10), %ymm6, %ymm6 14198; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] 14199; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14200; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 14201; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 14202; AVX-NEXT: vbroadcastsd 488(%rdx), %ymm2 14203; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 14204; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] 14205; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14206; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] 14207; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 14208; AVX-NEXT: vbroadcastsd 488(%r10), %ymm2 14209; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 14210; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 14211; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14212; AVX-NEXT: vmovaps 16(%rsi), %xmm0 14213; AVX-NEXT: vmovaps 16(%rdi), %xmm1 14214; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14215; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14216; AVX-NEXT: vbroadcastsd 16(%rcx), %ymm3 14217; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14218; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14219; AVX-NEXT: vmovaps 16(%r9), %xmm2 14220; AVX-NEXT: vmovaps 16(%r8), 
%xmm3 14221; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14222; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14223; AVX-NEXT: vbroadcastsd 16(%rax), %ymm5 14224; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14225; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14226; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14227; AVX-NEXT: vbroadcastsd 24(%rdx), %ymm1 14228; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14229; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14230; AVX-NEXT: vbroadcastsd 24(%r10), %ymm1 14231; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14232; AVX-NEXT: vmovaps 48(%rsi), %xmm0 14233; AVX-NEXT: vmovaps 48(%rdi), %xmm1 14234; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14235; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14236; AVX-NEXT: vbroadcastsd 48(%rcx), %ymm3 14237; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14238; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14239; AVX-NEXT: vmovaps 48(%r9), %xmm2 14240; AVX-NEXT: vmovaps 48(%r8), %xmm3 14241; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14242; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14243; AVX-NEXT: vbroadcastsd 48(%rax), %ymm5 14244; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14245; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14246; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14247; AVX-NEXT: vbroadcastsd 56(%rdx), %ymm1 14248; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14249; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14250; AVX-NEXT: vbroadcastsd 56(%r10), %ymm1 14251; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14252; AVX-NEXT: vmovaps 80(%rsi), %xmm0 14253; AVX-NEXT: vmovaps 80(%rdi), %xmm1 14254; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14255; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14256; AVX-NEXT: vbroadcastsd 80(%rcx), %ymm3 14257; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14258; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14259; AVX-NEXT: vmovaps 80(%r9), %xmm2 14260; AVX-NEXT: vmovaps 80(%r8), %xmm3 14261; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14262; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14263; AVX-NEXT: vbroadcastsd 80(%rax), %ymm5 14264; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14265; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14266; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14267; AVX-NEXT: vbroadcastsd 88(%rdx), %ymm1 14268; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14269; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14270; AVX-NEXT: vbroadcastsd 88(%r10), %ymm1 14271; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14272; AVX-NEXT: vmovaps 112(%rsi), %xmm0 14273; AVX-NEXT: vmovaps 112(%rdi), %xmm1 14274; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14275; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14276; AVX-NEXT: vbroadcastsd 112(%rcx), %ymm3 14277; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14278; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14279; AVX-NEXT: vmovaps 112(%r9), %xmm2 14280; AVX-NEXT: vmovaps 112(%r8), %xmm3 14281; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14282; AVX-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0,1,2,3],mem[4,5,6,7] 14283; AVX-NEXT: vbroadcastsd 112(%rax), %ymm5 14284; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14285; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14286; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14287; AVX-NEXT: vbroadcastsd 120(%rdx), %ymm1 14288; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14289; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14290; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14291; AVX-NEXT: vbroadcastsd 120(%r10), %ymm1 14292; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14293; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14294; AVX-NEXT: vmovaps 144(%rsi), %xmm0 14295; AVX-NEXT: vmovaps 144(%rdi), %xmm1 14296; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14297; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14298; AVX-NEXT: vbroadcastsd 144(%rcx), %ymm3 14299; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14300; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14301; AVX-NEXT: vmovaps 144(%r9), %xmm2 14302; AVX-NEXT: vmovaps 144(%r8), %xmm3 14303; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14304; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14305; AVX-NEXT: vbroadcastsd 144(%rax), %ymm5 14306; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14307; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14308; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14309; AVX-NEXT: vbroadcastsd 152(%rdx), %ymm1 14310; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14311; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14312; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14313; AVX-NEXT: vbroadcastsd 152(%r10), %ymm1 14314; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14315; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14316; AVX-NEXT: vmovaps 176(%rsi), %xmm0 14317; AVX-NEXT: vmovaps 176(%rdi), %xmm1 14318; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14319; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14320; AVX-NEXT: vbroadcastsd 176(%rcx), %ymm3 14321; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14322; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14323; AVX-NEXT: vmovaps 176(%r9), %xmm2 14324; AVX-NEXT: vmovaps 176(%r8), %xmm3 14325; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14326; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14327; AVX-NEXT: vbroadcastsd 176(%rax), %ymm5 14328; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14329; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14330; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14331; AVX-NEXT: vbroadcastsd 184(%rdx), %ymm1 14332; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14333; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14334; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14335; AVX-NEXT: vbroadcastsd 184(%r10), %ymm1 14336; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14337; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14338; AVX-NEXT: vmovaps 208(%rsi), %xmm0 14339; AVX-NEXT: vmovaps 208(%rdi), %xmm1 14340; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14341; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14342; AVX-NEXT: vbroadcastsd 
208(%rcx), %ymm3 14343; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14344; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14345; AVX-NEXT: vmovaps 208(%r9), %xmm2 14346; AVX-NEXT: vmovaps 208(%r8), %xmm3 14347; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14348; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14349; AVX-NEXT: vbroadcastsd 208(%rax), %ymm5 14350; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14351; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14352; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14353; AVX-NEXT: vbroadcastsd 216(%rdx), %ymm1 14354; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14355; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14356; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14357; AVX-NEXT: vbroadcastsd 216(%r10), %ymm1 14358; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14359; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 14360; AVX-NEXT: vmovaps 240(%rsi), %xmm0 14361; AVX-NEXT: vmovaps 240(%rdi), %xmm1 14362; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14363; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14364; AVX-NEXT: vbroadcastsd 240(%rcx), %ymm3 14365; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14366; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14367; AVX-NEXT: vmovaps 240(%r9), %xmm2 14368; AVX-NEXT: vmovaps 240(%r8), %xmm3 14369; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14370; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14371; AVX-NEXT: vbroadcastsd 240(%rax), %ymm5 14372; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14373; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14374; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14375; AVX-NEXT: vbroadcastsd 248(%rdx), %ymm1 14376; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14377; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14378; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14379; AVX-NEXT: vbroadcastsd 248(%r10), %ymm1 14380; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14381; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14382; AVX-NEXT: vmovaps 272(%rsi), %xmm0 14383; AVX-NEXT: vmovaps 272(%rdi), %xmm1 14384; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14385; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14386; AVX-NEXT: vbroadcastsd 272(%rcx), %ymm3 14387; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14388; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14389; AVX-NEXT: vmovaps 272(%r9), %xmm2 14390; AVX-NEXT: vmovaps 272(%r8), %xmm3 14391; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14392; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14393; AVX-NEXT: vbroadcastsd 272(%rax), %ymm5 14394; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14395; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14396; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14397; AVX-NEXT: vbroadcastsd 280(%rdx), %ymm1 14398; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14399; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14400; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14401; AVX-NEXT: vbroadcastsd 280(%r10), %ymm1 14402; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] 14403; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14404; AVX-NEXT: vmovaps 304(%rsi), %xmm0 14405; AVX-NEXT: vmovaps 304(%rdi), %xmm1 14406; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14407; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14408; AVX-NEXT: vbroadcastsd 304(%rcx), %ymm3 14409; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14410; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14411; AVX-NEXT: vmovaps 304(%r9), %xmm2 14412; AVX-NEXT: vmovaps 304(%r8), %xmm3 14413; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14414; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14415; AVX-NEXT: vbroadcastsd 304(%rax), %ymm5 14416; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14417; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14418; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14419; AVX-NEXT: vbroadcastsd 312(%rdx), %ymm1 14420; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14421; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14422; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14423; AVX-NEXT: vbroadcastsd 312(%r10), %ymm1 14424; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14425; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14426; AVX-NEXT: vmovaps 336(%rsi), %xmm0 14427; AVX-NEXT: vmovaps 336(%rdi), %xmm1 14428; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14429; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14430; AVX-NEXT: vbroadcastsd 336(%rcx), %ymm3 14431; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14432; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14433; AVX-NEXT: vmovaps 336(%r9), %xmm2 14434; AVX-NEXT: vmovaps 336(%r8), %xmm3 14435; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14436; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14437; AVX-NEXT: vbroadcastsd 336(%rax), %ymm5 14438; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14439; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14440; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14441; AVX-NEXT: vbroadcastsd 344(%rdx), %ymm1 14442; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14443; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14444; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14445; AVX-NEXT: vbroadcastsd 344(%r10), %ymm1 14446; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14447; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14448; AVX-NEXT: vmovaps 368(%rsi), %xmm0 14449; AVX-NEXT: vmovaps 368(%rdi), %xmm1 14450; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14451; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14452; AVX-NEXT: vbroadcastsd 368(%rcx), %ymm3 14453; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14454; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14455; AVX-NEXT: vmovaps 368(%r9), %xmm2 14456; AVX-NEXT: vmovaps 368(%r8), %xmm3 14457; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14458; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14459; AVX-NEXT: vbroadcastsd 368(%rax), %ymm5 14460; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14461; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14462; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = 
xmm1[1],xmm0[1] 14463; AVX-NEXT: vbroadcastsd 376(%rdx), %ymm1 14464; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14465; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14466; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14467; AVX-NEXT: vbroadcastsd 376(%r10), %ymm1 14468; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14469; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14470; AVX-NEXT: vmovaps 400(%rsi), %xmm0 14471; AVX-NEXT: vmovaps 400(%rdi), %xmm1 14472; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14473; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14474; AVX-NEXT: vbroadcastsd 400(%rcx), %ymm3 14475; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14476; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14477; AVX-NEXT: vmovaps 400(%r9), %xmm2 14478; AVX-NEXT: vmovaps 400(%r8), %xmm3 14479; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14480; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14481; AVX-NEXT: vbroadcastsd 400(%rax), %ymm5 14482; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14483; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14484; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14485; AVX-NEXT: vbroadcastsd 408(%rdx), %ymm1 14486; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14487; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14488; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14489; AVX-NEXT: vbroadcastsd 408(%r10), %ymm1 14490; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14491; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14492; AVX-NEXT: vmovaps 432(%rsi), %xmm0 14493; AVX-NEXT: vmovaps 432(%rdi), %xmm1 14494; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14495; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14496; AVX-NEXT: vbroadcastsd 432(%rcx), %ymm3 14497; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14498; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14499; AVX-NEXT: vmovaps 432(%r9), %xmm2 14500; AVX-NEXT: vmovaps 432(%r8), %xmm3 14501; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14502; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14503; AVX-NEXT: vbroadcastsd 432(%rax), %ymm5 14504; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14505; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14506; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14507; AVX-NEXT: vbroadcastsd 440(%rdx), %ymm1 14508; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14509; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14510; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14511; AVX-NEXT: vbroadcastsd 440(%r10), %ymm1 14512; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14513; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14514; AVX-NEXT: vmovaps 464(%rsi), %xmm0 14515; AVX-NEXT: vmovaps 464(%rdi), %xmm1 14516; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14517; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14518; AVX-NEXT: vbroadcastsd 464(%rcx), %ymm3 14519; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14520; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14521; AVX-NEXT: vmovaps 464(%r9), %xmm2 14522; AVX-NEXT: vmovaps 464(%r8), %xmm3 14523; AVX-NEXT: 
vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] 14524; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 14525; AVX-NEXT: vbroadcastsd 464(%rax), %ymm5 14526; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14527; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14528; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14529; AVX-NEXT: vbroadcastsd 472(%rdx), %ymm1 14530; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14531; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] 14532; AVX-NEXT: vbroadcastsd 472(%r10), %ymm1 14533; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14534; AVX-NEXT: vmovaps 496(%rsi), %xmm0 14535; AVX-NEXT: vmovaps 496(%rdi), %xmm1 14536; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] 14537; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14538; AVX-NEXT: vbroadcastsd 496(%rcx), %ymm3 14539; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14540; AVX-NEXT: vmovaps 496(%r9), %xmm3 14541; AVX-NEXT: vmovaps 496(%r8), %xmm4 14542; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm3[0] 14543; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] 14544; AVX-NEXT: vbroadcastsd 496(%rax), %ymm5 14545; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm5[6,7] 14546; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 14547; AVX-NEXT: vbroadcastsd 504(%rdx), %ymm1 14548; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14549; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] 14550; AVX-NEXT: vbroadcastsd 504(%r10), %ymm1 14551; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14552; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdx 14553; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] 14554; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14555; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],mem[6,7] 14556; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14557; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],mem[6,7] 14558; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14559; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] 14560; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14561; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] 14562; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14563; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],mem[6,7] 14564; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14565; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14566; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 14567; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14568; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14569; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 14570; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14571; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14572; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 14573; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14574; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14575; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 14576; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14577; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14578; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],mem[6,7] 14579; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14580; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14581; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 14582; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14583; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14584; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 14585; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14586; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 14587; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 14588; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 14589; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14590; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 14591; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14592; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14593; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 14594; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14595; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14596; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 14597; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14598; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14599; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 14600; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14601; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14602; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],mem[6,7] 14603; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14604; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] 14605; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14606; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7] 14607; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14608; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],mem[6,7] 14609; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14610; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] 14611; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14612; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] 14613; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14614; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] 14615; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14616; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] 14617; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14618; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] 14619; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14620; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 14621; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] 14622; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] 14623; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] 14624; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] 14625; AVX-NEXT: vmovaps %ymm3, 4064(%rdx) 14626; AVX-NEXT: vmovaps %ymm5, 4032(%rdx) 14627; AVX-NEXT: vmovaps %ymm8, 4000(%rdx) 14628; AVX-NEXT: vmovaps %ymm11, 3968(%rdx) 14629; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 14630; AVX-NEXT: 
vmovaps %ymm3, 3936(%rdx) 14631; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 14632; AVX-NEXT: vmovaps %ymm3, 3904(%rdx) 14633; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 14634; AVX-NEXT: vmovaps %ymm3, 3872(%rdx) 14635; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 14636; AVX-NEXT: vmovaps %ymm3, 3840(%rdx) 14637; AVX-NEXT: vmovaps %ymm12, 3808(%rdx) 14638; AVX-NEXT: vmovaps %ymm13, 3776(%rdx) 14639; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 14640; AVX-NEXT: vmovaps %ymm3, 3744(%rdx) 14641; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 14642; AVX-NEXT: vmovaps %ymm3, 3712(%rdx) 14643; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 14644; AVX-NEXT: vmovaps %ymm3, 3680(%rdx) 14645; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 14646; AVX-NEXT: vmovaps %ymm3, 3648(%rdx) 14647; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 14648; AVX-NEXT: vmovaps %ymm3, 3616(%rdx) 14649; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 14650; AVX-NEXT: vmovaps %ymm3, 3584(%rdx) 14651; AVX-NEXT: vmovaps %ymm0, 3552(%rdx) 14652; AVX-NEXT: vmovaps %ymm1, 3520(%rdx) 14653; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14654; AVX-NEXT: vmovaps %ymm0, 3488(%rdx) 14655; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14656; AVX-NEXT: vmovaps %ymm0, 3456(%rdx) 14657; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14658; AVX-NEXT: vmovaps %ymm0, 3424(%rdx) 14659; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14660; AVX-NEXT: vmovaps %ymm0, 3392(%rdx) 14661; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14662; AVX-NEXT: vmovaps %ymm0, 3360(%rdx) 14663; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14664; AVX-NEXT: vmovaps %ymm0, 3328(%rdx) 14665; AVX-NEXT: vmovaps %ymm2, 3296(%rdx) 14666; AVX-NEXT: vmovaps %ymm4, 3264(%rdx) 14667; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14668; AVX-NEXT: vmovaps %ymm0, 3232(%rdx) 14669; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14670; AVX-NEXT: vmovaps %ymm0, 3200(%rdx) 14671; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14672; AVX-NEXT: vmovaps %ymm0, 3168(%rdx) 14673; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14674; AVX-NEXT: vmovaps %ymm0, 3136(%rdx) 14675; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14676; AVX-NEXT: vmovaps %ymm0, 3104(%rdx) 14677; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14678; AVX-NEXT: vmovaps %ymm0, 3072(%rdx) 14679; AVX-NEXT: vmovaps %ymm6, 3040(%rdx) 14680; AVX-NEXT: vmovaps %ymm7, 3008(%rdx) 14681; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14682; AVX-NEXT: vmovaps %ymm0, 2976(%rdx) 14683; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14684; AVX-NEXT: vmovaps %ymm0, 2944(%rdx) 14685; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14686; AVX-NEXT: vmovaps %ymm0, 2912(%rdx) 14687; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14688; AVX-NEXT: vmovaps %ymm0, 2880(%rdx) 14689; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14690; AVX-NEXT: vmovaps %ymm0, 2848(%rdx) 14691; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14692; AVX-NEXT: vmovaps %ymm0, 2816(%rdx) 14693; AVX-NEXT: 
vmovaps %ymm9, 2784(%rdx) 14694; AVX-NEXT: vmovaps %ymm10, 2752(%rdx) 14695; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14696; AVX-NEXT: vmovaps %ymm0, 2720(%rdx) 14697; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14698; AVX-NEXT: vmovaps %ymm0, 2688(%rdx) 14699; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14700; AVX-NEXT: vmovaps %ymm0, 2656(%rdx) 14701; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14702; AVX-NEXT: vmovaps %ymm0, 2624(%rdx) 14703; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14704; AVX-NEXT: vmovaps %ymm0, 2592(%rdx) 14705; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14706; AVX-NEXT: vmovaps %ymm0, 2560(%rdx) 14707; AVX-NEXT: vmovaps %ymm14, 2528(%rdx) 14708; AVX-NEXT: vmovaps %ymm15, 2496(%rdx) 14709; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14710; AVX-NEXT: vmovaps %ymm0, 2464(%rdx) 14711; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14712; AVX-NEXT: vmovaps %ymm0, 2432(%rdx) 14713; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14714; AVX-NEXT: vmovaps %ymm0, 2400(%rdx) 14715; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14716; AVX-NEXT: vmovaps %ymm0, 2368(%rdx) 14717; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14718; AVX-NEXT: vmovaps %ymm0, 2336(%rdx) 14719; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14720; AVX-NEXT: vmovaps %ymm0, 2304(%rdx) 14721; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14722; AVX-NEXT: vmovaps %ymm0, 2272(%rdx) 14723; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14724; AVX-NEXT: vmovaps %ymm0, 2240(%rdx) 14725; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14726; AVX-NEXT: vmovaps %ymm0, 2208(%rdx) 14727; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14728; AVX-NEXT: vmovaps %ymm0, 2176(%rdx) 14729; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14730; AVX-NEXT: vmovaps %ymm0, 2144(%rdx) 14731; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14732; AVX-NEXT: vmovaps %ymm0, 2112(%rdx) 14733; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14734; AVX-NEXT: vmovaps %ymm0, 2080(%rdx) 14735; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14736; AVX-NEXT: vmovaps %ymm0, 2048(%rdx) 14737; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14738; AVX-NEXT: vmovaps %ymm0, 2016(%rdx) 14739; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14740; AVX-NEXT: vmovaps %ymm0, 1984(%rdx) 14741; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14742; AVX-NEXT: vmovaps %ymm0, 1952(%rdx) 14743; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14744; AVX-NEXT: vmovaps %ymm0, 1920(%rdx) 14745; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14746; AVX-NEXT: vmovaps %ymm0, 1888(%rdx) 14747; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14748; AVX-NEXT: vmovaps %ymm0, 1856(%rdx) 14749; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14750; AVX-NEXT: vmovaps %ymm0, 1824(%rdx) 14751; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14752; AVX-NEXT: vmovaps %ymm0, 1792(%rdx) 14753; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 14754; AVX-NEXT: vmovaps %ymm0, 1760(%rdx) 14755; AVX-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14756; AVX-NEXT: vmovaps %ymm0, 1728(%rdx) 14757; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14758; AVX-NEXT: vmovaps %ymm0, 1696(%rdx) 14759; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14760; AVX-NEXT: vmovaps %ymm0, 1664(%rdx) 14761; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14762; AVX-NEXT: vmovaps %ymm0, 1632(%rdx) 14763; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14764; AVX-NEXT: vmovaps %ymm0, 1600(%rdx) 14765; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14766; AVX-NEXT: vmovaps %ymm0, 1568(%rdx) 14767; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14768; AVX-NEXT: vmovaps %ymm0, 1536(%rdx) 14769; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14770; AVX-NEXT: vmovaps %ymm0, 1504(%rdx) 14771; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14772; AVX-NEXT: vmovaps %ymm0, 1472(%rdx) 14773; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14774; AVX-NEXT: vmovaps %ymm0, 1440(%rdx) 14775; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14776; AVX-NEXT: vmovaps %ymm0, 1408(%rdx) 14777; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14778; AVX-NEXT: vmovaps %ymm0, 1376(%rdx) 14779; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14780; AVX-NEXT: vmovaps %ymm0, 1344(%rdx) 14781; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14782; AVX-NEXT: vmovaps %ymm0, 1312(%rdx) 14783; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14784; AVX-NEXT: vmovaps %ymm0, 1280(%rdx) 14785; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14786; AVX-NEXT: vmovaps %ymm0, 1248(%rdx) 14787; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14788; AVX-NEXT: vmovaps %ymm0, 1216(%rdx) 14789; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14790; AVX-NEXT: vmovaps %ymm0, 1184(%rdx) 14791; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14792; AVX-NEXT: vmovaps %ymm0, 1152(%rdx) 14793; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14794; AVX-NEXT: vmovaps %ymm0, 1120(%rdx) 14795; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14796; AVX-NEXT: vmovaps %ymm0, 1088(%rdx) 14797; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14798; AVX-NEXT: vmovaps %ymm0, 1056(%rdx) 14799; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14800; AVX-NEXT: vmovaps %ymm0, 1024(%rdx) 14801; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14802; AVX-NEXT: vmovaps %ymm0, 992(%rdx) 14803; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14804; AVX-NEXT: vmovaps %ymm0, 960(%rdx) 14805; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14806; AVX-NEXT: vmovaps %ymm0, 928(%rdx) 14807; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14808; AVX-NEXT: vmovaps %ymm0, 896(%rdx) 14809; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14810; AVX-NEXT: vmovaps %ymm0, 864(%rdx) 14811; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14812; AVX-NEXT: vmovaps %ymm0, 832(%rdx) 14813; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14814; AVX-NEXT: vmovaps %ymm0, 800(%rdx) 14815; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload 14816; AVX-NEXT: vmovaps %ymm0, 768(%rdx) 14817; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14818; AVX-NEXT: vmovaps %ymm0, 736(%rdx) 14819; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14820; AVX-NEXT: vmovaps %ymm0, 704(%rdx) 14821; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14822; AVX-NEXT: vmovaps %ymm0, 672(%rdx) 14823; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14824; AVX-NEXT: vmovaps %ymm0, 640(%rdx) 14825; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14826; AVX-NEXT: vmovaps %ymm0, 608(%rdx) 14827; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14828; AVX-NEXT: vmovaps %ymm0, 576(%rdx) 14829; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14830; AVX-NEXT: vmovaps %ymm0, 544(%rdx) 14831; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14832; AVX-NEXT: vmovaps %ymm0, 512(%rdx) 14833; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14834; AVX-NEXT: vmovaps %ymm0, 480(%rdx) 14835; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14836; AVX-NEXT: vmovaps %ymm0, 448(%rdx) 14837; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14838; AVX-NEXT: vmovaps %ymm0, 416(%rdx) 14839; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14840; AVX-NEXT: vmovaps %ymm0, 384(%rdx) 14841; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14842; AVX-NEXT: vmovaps %ymm0, 352(%rdx) 14843; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14844; AVX-NEXT: vmovaps %ymm0, 320(%rdx) 14845; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14846; AVX-NEXT: vmovaps %ymm0, 288(%rdx) 14847; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14848; AVX-NEXT: vmovaps %ymm0, 256(%rdx) 14849; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14850; AVX-NEXT: vmovaps %ymm0, 224(%rdx) 14851; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14852; AVX-NEXT: vmovaps %ymm0, 192(%rdx) 14853; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14854; AVX-NEXT: vmovaps %ymm0, 160(%rdx) 14855; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14856; AVX-NEXT: vmovaps %ymm0, 128(%rdx) 14857; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14858; AVX-NEXT: vmovaps %ymm0, 96(%rdx) 14859; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14860; AVX-NEXT: vmovaps %ymm0, 64(%rdx) 14861; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14862; AVX-NEXT: vmovaps %ymm0, 32(%rdx) 14863; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14864; AVX-NEXT: vmovaps %ymm0, (%rdx) 14865; AVX-NEXT: addq $3784, %rsp # imm = 0xEC8 14866; AVX-NEXT: vzeroupper 14867; AVX-NEXT: retq 14868; 14869; AVX2-LABEL: store_i64_stride8_vf64: 14870; AVX2: # %bb.0: 14871; AVX2-NEXT: subq $3880, %rsp # imm = 0xF28 14872; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 14873; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 14874; AVX2-NEXT: vmovaps (%rcx), %xmm0 14875; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14876; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 14877; AVX2-NEXT: vmovaps (%rsi), %xmm2 14878; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14879; AVX2-NEXT: vmovaps 32(%rsi), %xmm3 14880; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14881; AVX2-NEXT: 
vmovaps (%rdi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 32(%rdi), %xmm4
; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX2-NEXT: vbroadcastsd 8(%rdx), %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps (%rax), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vmovaps (%r9), %xmm2
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps (%r8), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX2-NEXT: vbroadcastsd 8(%r10), %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 32(%rcx), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
; AVX2-NEXT: vbroadcastsd 40(%rdx), %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 32(%r9), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 32(%r8), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 40(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 32(%rax), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 64(%rsi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 64(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 72(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 64(%rcx), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 64(%r9), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 64(%r8), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 72(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 64(%rax), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 96(%rsi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 96(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 104(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 96(%rcx), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 96(%r9), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 96(%r8), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 104(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 96(%rax), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 128(%rsi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 128(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 136(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 128(%rcx), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 128(%r9), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 128(%r8), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 136(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 128(%rax), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 160(%rsi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 160(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 168(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 160(%rcx), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 160(%r9), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 160(%r8), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 168(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 160(%rax), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 192(%rsi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 192(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 200(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 192(%rcx), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 192(%r9), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 192(%r8), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 200(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 192(%rax), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 224(%rsi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 224(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 232(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 224(%rcx), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 224(%r9), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 224(%r8), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 232(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 224(%rax), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 256(%rsi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 256(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 264(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 256(%rcx), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 256(%r9), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 256(%r8), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 264(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 256(%rax), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 288(%rsi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps 288(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 296(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 288(%rcx), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 288(%r9), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 288(%r8), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 296(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 288(%rax), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 320(%rsi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 320(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 328(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 320(%rcx), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 320(%r9), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 320(%r8), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 328(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 320(%rax), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 352(%rsi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 352(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 360(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 352(%rcx), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 352(%r9), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 352(%r8), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 360(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 352(%rax), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 384(%rsi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 384(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 392(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 384(%rcx), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 384(%r9), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 384(%r8), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 392(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 384(%rax), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 416(%rsi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 416(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vbroadcastsd 424(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 416(%rcx), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 416(%r9), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 416(%r8), %xmm13
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1]
; AVX2-NEXT: vbroadcastsd 424(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 416(%rax), %xmm12
; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 448(%rsi), %xmm11
; AVX2-NEXT: vmovaps 448(%rdi), %xmm10
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1]
; AVX2-NEXT: vbroadcastsd 456(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 448(%rcx), %xmm9
; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 448(%r9), %xmm8
; AVX2-NEXT: vmovaps 448(%r8), %xmm7
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1]
; AVX2-NEXT: vbroadcastsd 456(%r10), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 448(%rax), %xmm6
; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 480(%rsi), %xmm5
; AVX2-NEXT: vmovaps 480(%rdi), %xmm4
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1]
; AVX2-NEXT: vbroadcastsd 488(%rdx), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovaps 480(%rcx), %xmm3
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 480(%r9), %xmm2
; AVX2-NEXT: vmovaps 480(%r8), %xmm1
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1]
; AVX2-NEXT: vbroadcastsd 488(%r10), %ymm15
; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7]
; AVX2-NEXT: vmovaps 480(%rax), %xmm0
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, (%r10), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 32(%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 64(%r10), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 96(%r10), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 128(%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 128(%r10), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 160(%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 160(%r10), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 192(%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 192(%r10), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 224(%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 224(%r10), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 256(%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 256(%r10), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd (%rsp), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 288(%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 288(%r10), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 320(%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 320(%r10), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 352(%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 352(%r10), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 384(%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 384(%r10), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 416(%rdx), %ymm14, %ymm14
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-NEXT: # xmm13 = xmm13[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, 416(%r10), %ymm13, %ymm13
; AVX2-NEXT: vbroadcastsd %xmm12, %ymm12
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0]
; AVX2-NEXT: vinsertf128 $1, 448(%rdx), %ymm10, %ymm10
; AVX2-NEXT: vbroadcastsd %xmm9, %ymm9
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0]
; AVX2-NEXT: vinsertf128 $1, 448(%r10), %ymm7, %ymm7
; AVX2-NEXT: vbroadcastsd %xmm6, %ymm6
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; AVX2-NEXT: vinsertf128 $1, 480(%rdx), %ymm4, %ymm4
; AVX2-NEXT: vbroadcastsd %xmm3, %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vinsertf128 $1, 480(%r10), %ymm1, %ymm1
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps (%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 16(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps (%r8), %ymm2
; AVX2-NEXT: vmovaps (%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 16(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 24(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 24(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
; AVX2-NEXT: vmovaps 32(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 32(%r8), %ymm2
; AVX2-NEXT: vmovaps 32(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 48(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 56(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 56(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovaps 64(%rdi), %ymm0
; AVX2-NEXT: vmovaps 64(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 80(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 64(%r8), %ymm2
; AVX2-NEXT: vmovaps 64(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 80(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 88(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 88(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovaps 96(%rdi), %ymm0
; AVX2-NEXT: vmovaps 96(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 112(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 96(%r8), %ymm2
; AVX2-NEXT: vmovaps 96(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 112(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 120(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 120(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 128(%rdi), %ymm0
; AVX2-NEXT: vmovaps 128(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 144(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 128(%r8), %ymm2
; AVX2-NEXT: vmovaps 128(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 144(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 152(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 152(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 160(%rdi), %ymm0
; AVX2-NEXT: vmovaps 160(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 176(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 160(%r8), %ymm2
; AVX2-NEXT: vmovaps 160(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 176(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 184(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 184(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 192(%rdi), %ymm0
; AVX2-NEXT: vmovaps 192(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 208(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 192(%r8), %ymm2
; AVX2-NEXT: vmovaps 192(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 208(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 216(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 216(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 224(%rdi), %ymm0
; AVX2-NEXT: vmovaps 224(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 240(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 224(%r8), %ymm2
; AVX2-NEXT: vmovaps 224(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 240(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 248(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 248(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 256(%rdi), %ymm0
; AVX2-NEXT: vmovaps 256(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 272(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 256(%r8), %ymm2
; AVX2-NEXT: vmovaps 256(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 272(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 280(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 280(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 288(%rdi), %ymm0
; AVX2-NEXT: vmovaps 288(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 304(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 288(%r8), %ymm2
; AVX2-NEXT: vmovaps 288(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 304(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 312(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 312(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 320(%rdi), %ymm0
; AVX2-NEXT: vmovaps 320(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 336(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 320(%r8), %ymm2
; AVX2-NEXT: vmovaps 320(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 336(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 344(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 344(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 352(%rdi), %ymm0
; AVX2-NEXT: vmovaps 352(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 368(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 352(%r8), %ymm2
; AVX2-NEXT: vmovaps 352(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 368(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 376(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 376(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 384(%rdi), %ymm0
; AVX2-NEXT: vmovaps 384(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 400(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 384(%r8), %ymm2
; AVX2-NEXT: vmovaps 384(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 400(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 408(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 408(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 416(%rdi), %ymm0
; AVX2-NEXT: vmovaps 416(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 432(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 416(%r8), %ymm2
; AVX2-NEXT: vmovaps 416(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 432(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 440(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 440(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 448(%rdi), %ymm0
; AVX2-NEXT: vmovaps 448(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 464(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 448(%r8), %ymm2
; AVX2-NEXT: vmovaps 448(%r9), %ymm3
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 464(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 472(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 472(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 480(%rdi), %ymm0
; AVX2-NEXT: vmovaps 480(%rsi), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 496(%rcx), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 480(%r8), %ymm3
; AVX2-NEXT: vmovaps 480(%r9), %ymm4
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd 496(%rax), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vbroadcastsd 504(%rdx), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
; AVX2-NEXT: vbroadcastsd 504(%r10), %ymm1
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovaps %ymm9, 4064(%rdx)
; AVX2-NEXT: vmovaps %ymm11, 4032(%rdx)
; AVX2-NEXT: vmovaps %ymm15, 4000(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm9, 3968(%rdx)
; AVX2-NEXT: vmovaps %ymm0, 3808(%rdx)
; AVX2-NEXT: vmovaps %ymm1, 3776(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3744(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3712(%rdx)
; AVX2-NEXT: vmovaps %ymm2, 3552(%rdx)
; AVX2-NEXT: vmovaps %ymm3, 3520(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3488(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3456(%rdx)
; AVX2-NEXT: vmovaps %ymm4, 3296(%rdx)
; AVX2-NEXT: vmovaps %ymm5, 3264(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3232(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3200(%rdx)
; AVX2-NEXT: vmovaps %ymm6, 3040(%rdx)
; AVX2-NEXT: vmovaps %ymm7, 3008(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 2976(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 2944(%rdx)
; AVX2-NEXT: vmovaps %ymm8, 2784(%rdx)
; AVX2-NEXT: vmovaps %ymm10, 2752(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 2720(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 2688(%rdx)
; AVX2-NEXT: vmovaps %ymm12, 2528(%rdx)
; AVX2-NEXT: vmovaps %ymm13, 2496(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 2464(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 2432(%rdx)
; AVX2-NEXT: vmovaps %ymm14, 2272(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 2240(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 2208(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 2176(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 2016(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1984(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1952(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1920(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1760(%rdx)
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1728(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1696(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1664(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1504(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1472(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1440(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1408(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1248(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1216(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1184(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1152(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 992(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 960(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 928(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 896(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 736(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 704(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 672(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 640(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 480(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 448(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 416(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 384(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 224(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 192(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 160(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 128(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3936(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3904(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3872(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3840(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3680(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3648(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3616(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3584(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3424(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 3392(%rdx)
;
AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16007; AVX2-NEXT: vmovaps %ymm0, 3360(%rdx) 16008; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16009; AVX2-NEXT: vmovaps %ymm0, 3328(%rdx) 16010; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16011; AVX2-NEXT: vmovaps %ymm0, 3168(%rdx) 16012; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16013; AVX2-NEXT: vmovaps %ymm0, 3136(%rdx) 16014; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16015; AVX2-NEXT: vmovaps %ymm0, 3104(%rdx) 16016; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16017; AVX2-NEXT: vmovaps %ymm0, 3072(%rdx) 16018; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16019; AVX2-NEXT: vmovaps %ymm0, 2912(%rdx) 16020; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16021; AVX2-NEXT: vmovaps %ymm0, 2880(%rdx) 16022; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16023; AVX2-NEXT: vmovaps %ymm0, 2848(%rdx) 16024; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16025; AVX2-NEXT: vmovaps %ymm0, 2816(%rdx) 16026; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16027; AVX2-NEXT: vmovaps %ymm0, 2656(%rdx) 16028; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16029; AVX2-NEXT: vmovaps %ymm0, 2624(%rdx) 16030; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16031; AVX2-NEXT: vmovaps %ymm0, 2592(%rdx) 16032; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16033; AVX2-NEXT: vmovaps %ymm0, 2560(%rdx) 16034; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16035; AVX2-NEXT: vmovaps %ymm0, 2400(%rdx) 16036; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16037; AVX2-NEXT: vmovaps %ymm0, 2368(%rdx) 16038; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16039; AVX2-NEXT: vmovaps %ymm0, 2336(%rdx) 16040; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16041; AVX2-NEXT: vmovaps %ymm0, 2304(%rdx) 16042; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16043; AVX2-NEXT: vmovaps %ymm0, 2144(%rdx) 16044; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16045; AVX2-NEXT: vmovaps %ymm0, 2112(%rdx) 16046; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16047; AVX2-NEXT: vmovaps %ymm0, 2080(%rdx) 16048; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16049; AVX2-NEXT: vmovaps %ymm0, 2048(%rdx) 16050; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16051; AVX2-NEXT: vmovaps %ymm0, 1888(%rdx) 16052; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16053; AVX2-NEXT: vmovaps %ymm0, 1856(%rdx) 16054; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16055; AVX2-NEXT: vmovaps %ymm0, 1824(%rdx) 16056; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16057; AVX2-NEXT: vmovaps %ymm0, 1792(%rdx) 16058; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16059; AVX2-NEXT: vmovaps %ymm0, 1632(%rdx) 16060; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16061; AVX2-NEXT: vmovaps %ymm0, 1600(%rdx) 16062; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16063; AVX2-NEXT: vmovaps %ymm0, 1568(%rdx) 16064; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16065; AVX2-NEXT: vmovaps %ymm0, 
1536(%rdx) 16066; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16067; AVX2-NEXT: vmovaps %ymm0, 1376(%rdx) 16068; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16069; AVX2-NEXT: vmovaps %ymm0, 1344(%rdx) 16070; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16071; AVX2-NEXT: vmovaps %ymm0, 1312(%rdx) 16072; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16073; AVX2-NEXT: vmovaps %ymm0, 1280(%rdx) 16074; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16075; AVX2-NEXT: vmovaps %ymm0, 1120(%rdx) 16076; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16077; AVX2-NEXT: vmovaps %ymm0, 1088(%rdx) 16078; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16079; AVX2-NEXT: vmovaps %ymm0, 1056(%rdx) 16080; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16081; AVX2-NEXT: vmovaps %ymm0, 1024(%rdx) 16082; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16083; AVX2-NEXT: vmovaps %ymm0, 864(%rdx) 16084; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16085; AVX2-NEXT: vmovaps %ymm0, 832(%rdx) 16086; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16087; AVX2-NEXT: vmovaps %ymm0, 800(%rdx) 16088; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16089; AVX2-NEXT: vmovaps %ymm0, 768(%rdx) 16090; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16091; AVX2-NEXT: vmovaps %ymm0, 608(%rdx) 16092; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16093; AVX2-NEXT: vmovaps %ymm0, 576(%rdx) 16094; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16095; AVX2-NEXT: vmovaps %ymm0, 544(%rdx) 16096; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16097; AVX2-NEXT: vmovaps %ymm0, 512(%rdx) 16098; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16099; AVX2-NEXT: vmovaps %ymm0, 352(%rdx) 16100; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16101; AVX2-NEXT: vmovaps %ymm0, 320(%rdx) 16102; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16103; AVX2-NEXT: vmovaps %ymm0, 288(%rdx) 16104; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16105; AVX2-NEXT: vmovaps %ymm0, 256(%rdx) 16106; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16107; AVX2-NEXT: vmovaps %ymm0, 96(%rdx) 16108; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16109; AVX2-NEXT: vmovaps %ymm0, 64(%rdx) 16110; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16111; AVX2-NEXT: vmovaps %ymm0, 32(%rdx) 16112; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 16113; AVX2-NEXT: vmovaps %ymm0, (%rdx) 16114; AVX2-NEXT: addq $3880, %rsp # imm = 0xF28 16115; AVX2-NEXT: vzeroupper 16116; AVX2-NEXT: retq 16117; 16118; AVX2-FP-LABEL: store_i64_stride8_vf64: 16119; AVX2-FP: # %bb.0: 16120; AVX2-FP-NEXT: subq $3880, %rsp # imm = 0xF28 16121; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 16122; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 16123; AVX2-FP-NEXT: vmovaps (%rcx), %xmm0 16124; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16125; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 16126; AVX2-FP-NEXT: vmovaps (%rsi), %xmm2 16127; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16128; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm3 16129; AVX2-FP-NEXT: vmovaps %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16130; AVX2-FP-NEXT: vmovaps (%rdi), %xmm1 16131; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16132; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4 16133; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16134; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 16135; AVX2-FP-NEXT: vbroadcastsd 8(%rdx), %ymm2 16136; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 16137; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 16138; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16139; AVX2-FP-NEXT: vmovaps (%rax), %xmm0 16140; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16141; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 16142; AVX2-FP-NEXT: vmovaps (%r9), %xmm2 16143; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16144; AVX2-FP-NEXT: vmovaps (%r8), %xmm1 16145; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16146; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 16147; AVX2-FP-NEXT: vbroadcastsd 8(%r10), %ymm2 16148; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 16149; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 16150; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16151; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm0 16152; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16153; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 16154; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1] 16155; AVX2-FP-NEXT: vbroadcastsd 40(%rdx), %ymm2 16156; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 16157; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 16158; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16159; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm1 16160; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16161; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm0 16162; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16163; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16164; AVX2-FP-NEXT: vbroadcastsd 40(%r10), %ymm1 16165; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16166; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm1 16167; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16168; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16169; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16170; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16171; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm1 16172; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16173; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm0 16174; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16175; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16176; AVX2-FP-NEXT: vbroadcastsd 72(%rdx), %ymm1 16177; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16178; AVX2-FP-NEXT: vmovaps 64(%rcx), %xmm1 16179; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16180; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16181; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16182; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16183; AVX2-FP-NEXT: vmovaps 64(%r9), %xmm1 16184; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16185; AVX2-FP-NEXT: 
vmovaps 64(%r8), %xmm0 16186; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16187; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16188; AVX2-FP-NEXT: vbroadcastsd 72(%r10), %ymm1 16189; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16190; AVX2-FP-NEXT: vmovaps 64(%rax), %xmm1 16191; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16192; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16193; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16194; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16195; AVX2-FP-NEXT: vmovaps 96(%rsi), %xmm1 16196; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16197; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm0 16198; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16199; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16200; AVX2-FP-NEXT: vbroadcastsd 104(%rdx), %ymm1 16201; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16202; AVX2-FP-NEXT: vmovaps 96(%rcx), %xmm1 16203; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16204; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16205; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16206; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16207; AVX2-FP-NEXT: vmovaps 96(%r9), %xmm1 16208; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16209; AVX2-FP-NEXT: vmovaps 96(%r8), %xmm0 16210; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16211; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16212; AVX2-FP-NEXT: vbroadcastsd 104(%r10), %ymm1 16213; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16214; AVX2-FP-NEXT: vmovaps 96(%rax), %xmm1 16215; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16216; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16217; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16218; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16219; AVX2-FP-NEXT: vmovaps 128(%rsi), %xmm1 16220; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16221; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm0 16222; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16223; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16224; AVX2-FP-NEXT: vbroadcastsd 136(%rdx), %ymm1 16225; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16226; AVX2-FP-NEXT: vmovaps 128(%rcx), %xmm1 16227; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16228; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16229; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16230; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16231; AVX2-FP-NEXT: vmovaps 128(%r9), %xmm1 16232; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16233; AVX2-FP-NEXT: vmovaps 128(%r8), %xmm0 16234; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16235; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16236; AVX2-FP-NEXT: vbroadcastsd 136(%r10), %ymm1 16237; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16238; AVX2-FP-NEXT: vmovaps 128(%rax), %xmm1 16239; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16240; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16241; AVX2-FP-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16242; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16243; AVX2-FP-NEXT: vmovaps 160(%rsi), %xmm1 16244; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16245; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm0 16246; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16247; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16248; AVX2-FP-NEXT: vbroadcastsd 168(%rdx), %ymm1 16249; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16250; AVX2-FP-NEXT: vmovaps 160(%rcx), %xmm1 16251; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16252; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16253; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16254; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16255; AVX2-FP-NEXT: vmovaps 160(%r9), %xmm1 16256; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16257; AVX2-FP-NEXT: vmovaps 160(%r8), %xmm0 16258; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16259; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16260; AVX2-FP-NEXT: vbroadcastsd 168(%r10), %ymm1 16261; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16262; AVX2-FP-NEXT: vmovaps 160(%rax), %xmm1 16263; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16264; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16265; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16266; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16267; AVX2-FP-NEXT: vmovaps 192(%rsi), %xmm1 16268; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16269; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm0 16270; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16271; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16272; AVX2-FP-NEXT: vbroadcastsd 200(%rdx), %ymm1 16273; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16274; AVX2-FP-NEXT: vmovaps 192(%rcx), %xmm1 16275; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16276; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16277; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16278; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16279; AVX2-FP-NEXT: vmovaps 192(%r9), %xmm1 16280; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16281; AVX2-FP-NEXT: vmovaps 192(%r8), %xmm0 16282; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16283; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16284; AVX2-FP-NEXT: vbroadcastsd 200(%r10), %ymm1 16285; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16286; AVX2-FP-NEXT: vmovaps 192(%rax), %xmm1 16287; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16288; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16289; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16290; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16291; AVX2-FP-NEXT: vmovaps 224(%rsi), %xmm1 16292; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16293; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm0 16294; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16295; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16296; AVX2-FP-NEXT: vbroadcastsd 232(%rdx), %ymm1 16297; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16298; AVX2-FP-NEXT: vmovaps 224(%rcx), %xmm1 16299; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16300; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16301; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16302; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16303; AVX2-FP-NEXT: vmovaps 224(%r9), %xmm1 16304; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16305; AVX2-FP-NEXT: vmovaps 224(%r8), %xmm0 16306; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16307; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16308; AVX2-FP-NEXT: vbroadcastsd 232(%r10), %ymm1 16309; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16310; AVX2-FP-NEXT: vmovaps 224(%rax), %xmm1 16311; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16312; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16313; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16314; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16315; AVX2-FP-NEXT: vmovaps 256(%rsi), %xmm1 16316; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16317; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm0 16318; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16319; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16320; AVX2-FP-NEXT: vbroadcastsd 264(%rdx), %ymm1 16321; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16322; AVX2-FP-NEXT: vmovaps 256(%rcx), %xmm1 16323; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16324; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16325; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16326; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16327; AVX2-FP-NEXT: vmovaps 256(%r9), %xmm1 16328; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16329; AVX2-FP-NEXT: vmovaps 256(%r8), %xmm0 16330; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16331; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16332; AVX2-FP-NEXT: vbroadcastsd 264(%r10), %ymm1 16333; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16334; AVX2-FP-NEXT: vmovaps 256(%rax), %xmm1 16335; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16336; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16337; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16338; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16339; AVX2-FP-NEXT: vmovaps 288(%rsi), %xmm1 16340; AVX2-FP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill 16341; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm0 16342; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16343; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16344; AVX2-FP-NEXT: vbroadcastsd 296(%rdx), %ymm1 16345; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16346; AVX2-FP-NEXT: vmovaps 288(%rcx), %xmm1 16347; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16348; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16349; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16350; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16351; AVX2-FP-NEXT: vmovaps 288(%r9), %xmm1 16352; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16353; 
AVX2-FP-NEXT: vmovaps 288(%r8), %xmm0 16354; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16355; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16356; AVX2-FP-NEXT: vbroadcastsd 296(%r10), %ymm1 16357; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16358; AVX2-FP-NEXT: vmovaps 288(%rax), %xmm1 16359; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16360; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16361; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16362; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16363; AVX2-FP-NEXT: vmovaps 320(%rsi), %xmm1 16364; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16365; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm0 16366; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16367; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16368; AVX2-FP-NEXT: vbroadcastsd 328(%rdx), %ymm1 16369; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16370; AVX2-FP-NEXT: vmovaps 320(%rcx), %xmm1 16371; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16372; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16373; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16374; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16375; AVX2-FP-NEXT: vmovaps 320(%r9), %xmm1 16376; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16377; AVX2-FP-NEXT: vmovaps 320(%r8), %xmm0 16378; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16379; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16380; AVX2-FP-NEXT: vbroadcastsd 328(%r10), %ymm1 16381; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16382; AVX2-FP-NEXT: vmovaps 320(%rax), %xmm1 16383; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16384; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16385; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16386; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16387; AVX2-FP-NEXT: vmovaps 352(%rsi), %xmm1 16388; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16389; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm0 16390; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16391; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16392; AVX2-FP-NEXT: vbroadcastsd 360(%rdx), %ymm1 16393; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16394; AVX2-FP-NEXT: vmovaps 352(%rcx), %xmm1 16395; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16396; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16397; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16398; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16399; AVX2-FP-NEXT: vmovaps 352(%r9), %xmm1 16400; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16401; AVX2-FP-NEXT: vmovaps 352(%r8), %xmm0 16402; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16403; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16404; AVX2-FP-NEXT: vbroadcastsd 360(%r10), %ymm1 16405; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16406; AVX2-FP-NEXT: vmovaps 352(%rax), %xmm1 16407; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16408; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16409; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16410; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16411; AVX2-FP-NEXT: vmovaps 384(%rsi), %xmm1 16412; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16413; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm0 16414; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16415; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16416; AVX2-FP-NEXT: vbroadcastsd 392(%rdx), %ymm1 16417; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16418; AVX2-FP-NEXT: vmovaps 384(%rcx), %xmm1 16419; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16420; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16421; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16422; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16423; AVX2-FP-NEXT: vmovaps 384(%r9), %xmm1 16424; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16425; AVX2-FP-NEXT: vmovaps 384(%r8), %xmm0 16426; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16427; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16428; AVX2-FP-NEXT: vbroadcastsd 392(%r10), %ymm1 16429; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16430; AVX2-FP-NEXT: vmovaps 384(%rax), %xmm1 16431; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16432; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16433; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16434; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16435; AVX2-FP-NEXT: vmovaps 416(%rsi), %xmm1 16436; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16437; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm0 16438; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16439; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 16440; AVX2-FP-NEXT: vbroadcastsd 424(%rdx), %ymm1 16441; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16442; AVX2-FP-NEXT: vmovaps 416(%rcx), %xmm1 16443; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16444; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 16445; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16446; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16447; AVX2-FP-NEXT: vmovaps 416(%r9), %xmm0 16448; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16449; AVX2-FP-NEXT: vmovaps 416(%r8), %xmm13 16450; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] 16451; AVX2-FP-NEXT: vbroadcastsd 424(%r10), %ymm1 16452; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16453; AVX2-FP-NEXT: vmovaps 416(%rax), %xmm12 16454; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 16455; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16456; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16457; AVX2-FP-NEXT: vmovaps 448(%rsi), %xmm11 16458; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm10 16459; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] 16460; AVX2-FP-NEXT: vbroadcastsd 456(%rdx), %ymm1 16461; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16462; AVX2-FP-NEXT: vmovaps 448(%rcx), %xmm9 16463; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 16464; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16465; AVX2-FP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16466; AVX2-FP-NEXT: vmovaps 448(%r9), %xmm8 16467; AVX2-FP-NEXT: vmovaps 448(%r8), %xmm7 16468; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] 16469; AVX2-FP-NEXT: vbroadcastsd 456(%r10), %ymm1 16470; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16471; AVX2-FP-NEXT: vmovaps 448(%rax), %xmm6 16472; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 16473; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16474; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16475; AVX2-FP-NEXT: vmovaps 480(%rsi), %xmm5 16476; AVX2-FP-NEXT: vmovaps 480(%rdi), %xmm4 16477; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] 16478; AVX2-FP-NEXT: vbroadcastsd 488(%rdx), %ymm1 16479; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16480; AVX2-FP-NEXT: vmovaps 480(%rcx), %xmm3 16481; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 16482; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 16483; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16484; AVX2-FP-NEXT: vmovaps 480(%r9), %xmm2 16485; AVX2-FP-NEXT: vmovaps 480(%r8), %xmm1 16486; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] 16487; AVX2-FP-NEXT: vbroadcastsd 488(%r10), %ymm15 16488; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] 16489; AVX2-FP-NEXT: vmovaps 480(%rax), %xmm0 16490; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 16491; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] 16492; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16493; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16494; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16495; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16496; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm14 16497; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16498; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16499; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16500; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16501; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16502; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16503; AVX2-FP-NEXT: vinsertf128 $1, (%r10), %ymm14, %ymm14 16504; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16505; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16506; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16507; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16508; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16509; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16510; AVX2-FP-NEXT: vinsertf128 $1, 32(%rdx), %ymm14, %ymm14 16511; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16512; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16513; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16514; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16515; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16516; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16517; AVX2-FP-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14 16518; AVX2-FP-NEXT: vbroadcastsd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16519; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16520; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16521; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16522; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16523; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16524; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm14, %ymm14 16525; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16526; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16527; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16528; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16529; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16530; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16531; AVX2-FP-NEXT: vinsertf128 $1, 64(%r10), %ymm14, %ymm14 16532; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16533; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16534; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16535; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16536; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16537; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16538; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14 16539; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16540; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16541; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16542; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16543; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16544; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16545; AVX2-FP-NEXT: vinsertf128 $1, 96(%r10), %ymm14, %ymm14 16546; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16547; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16548; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16549; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16550; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16551; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16552; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdx), %ymm14, %ymm14 16553; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16554; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16555; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16556; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16557; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16558; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16559; AVX2-FP-NEXT: vinsertf128 $1, 128(%r10), %ymm14, %ymm14 16560; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16561; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16562; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16563; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16564; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte 
Folded Reload 16565; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16566; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdx), %ymm14, %ymm14 16567; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16568; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16569; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16570; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16571; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16572; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16573; AVX2-FP-NEXT: vinsertf128 $1, 160(%r10), %ymm14, %ymm14 16574; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16575; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16576; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16577; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16578; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16579; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16580; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdx), %ymm14, %ymm14 16581; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16582; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16583; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16584; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16585; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16586; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16587; AVX2-FP-NEXT: vinsertf128 $1, 192(%r10), %ymm14, %ymm14 16588; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16589; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16590; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16591; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16592; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16593; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16594; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdx), %ymm14, %ymm14 16595; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16596; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16597; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16598; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16599; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16600; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16601; AVX2-FP-NEXT: vinsertf128 $1, 224(%r10), %ymm14, %ymm14 16602; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16603; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16604; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16605; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16606; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16607; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16608; AVX2-FP-NEXT: vinsertf128 $1, 256(%rdx), %ymm14, %ymm14 16609; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16610; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16611; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill 16612; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16613; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16614; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16615; AVX2-FP-NEXT: vinsertf128 $1, 256(%r10), %ymm14, %ymm14 16616; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16617; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16618; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16619; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16620; AVX2-FP-NEXT: vunpcklpd (%rsp), %xmm14, %xmm14 # 16-byte Folded Reload 16621; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16622; AVX2-FP-NEXT: vinsertf128 $1, 288(%rdx), %ymm14, %ymm14 16623; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16624; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16625; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16626; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16627; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16628; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16629; AVX2-FP-NEXT: vinsertf128 $1, 288(%r10), %ymm14, %ymm14 16630; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16631; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16632; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16633; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16634; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16635; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16636; AVX2-FP-NEXT: vinsertf128 $1, 320(%rdx), %ymm14, %ymm14 16637; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16638; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16639; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16640; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16641; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16642; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16643; AVX2-FP-NEXT: vinsertf128 $1, 320(%r10), %ymm14, %ymm14 16644; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16645; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16646; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16647; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16648; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16649; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16650; AVX2-FP-NEXT: vinsertf128 $1, 352(%rdx), %ymm14, %ymm14 16651; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16652; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16653; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16654; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16655; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16656; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16657; AVX2-FP-NEXT: vinsertf128 $1, 352(%r10), %ymm14, %ymm14 16658; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 
16659; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16660; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16661; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16662; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16663; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16664; AVX2-FP-NEXT: vinsertf128 $1, 384(%rdx), %ymm14, %ymm14 16665; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16666; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16667; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16668; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16669; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16670; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16671; AVX2-FP-NEXT: vinsertf128 $1, 384(%r10), %ymm14, %ymm14 16672; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16673; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16674; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16675; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 16676; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 16677; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] 16678; AVX2-FP-NEXT: vinsertf128 $1, 416(%rdx), %ymm14, %ymm14 16679; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 16680; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 16681; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16682; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload 16683; AVX2-FP-NEXT: # xmm13 = xmm13[0],mem[0] 16684; AVX2-FP-NEXT: vinsertf128 $1, 416(%r10), %ymm13, %ymm13 16685; AVX2-FP-NEXT: vbroadcastsd %xmm12, %ymm12 16686; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] 16687; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16688; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] 16689; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdx), %ymm10, %ymm10 16690; AVX2-FP-NEXT: vbroadcastsd %xmm9, %ymm9 16691; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] 16692; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16693; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] 16694; AVX2-FP-NEXT: vinsertf128 $1, 448(%r10), %ymm7, %ymm7 16695; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm6 16696; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] 16697; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16698; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] 16699; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdx), %ymm4, %ymm4 16700; AVX2-FP-NEXT: vbroadcastsd %xmm3, %ymm3 16701; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] 16702; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16703; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] 16704; AVX2-FP-NEXT: vinsertf128 $1, 480(%r10), %ymm1, %ymm1 16705; AVX2-FP-NEXT: vbroadcastsd %xmm0, %ymm0 16706; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 16707; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16708; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 16709; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1 16710; AVX2-FP-NEXT: 
vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16711; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16712; AVX2-FP-NEXT: vbroadcastsd 16(%rcx), %ymm3 16713; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16714; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16715; AVX2-FP-NEXT: vmovaps (%r8), %ymm2 16716; AVX2-FP-NEXT: vmovaps (%r9), %ymm3 16717; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16718; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16719; AVX2-FP-NEXT: vbroadcastsd 16(%rax), %ymm5 16720; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 16721; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16722; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 16723; AVX2-FP-NEXT: vbroadcastsd 24(%rdx), %ymm1 16724; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] 16725; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 16726; AVX2-FP-NEXT: vbroadcastsd 24(%r10), %ymm1 16727; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3] 16728; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0 16729; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm1 16730; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16731; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16732; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm3 16733; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16734; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16735; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm2 16736; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm3 16737; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16738; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16739; AVX2-FP-NEXT: vbroadcastsd 48(%rax), %ymm5 16740; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 16741; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16742; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 16743; AVX2-FP-NEXT: vbroadcastsd 56(%rdx), %ymm1 16744; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3] 16745; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 16746; AVX2-FP-NEXT: vbroadcastsd 56(%r10), %ymm1 16747; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] 16748; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm0 16749; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm1 16750; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16751; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16752; AVX2-FP-NEXT: vbroadcastsd 80(%rcx), %ymm3 16753; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16754; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16755; AVX2-FP-NEXT: vmovaps 64(%r8), %ymm2 16756; AVX2-FP-NEXT: vmovaps 64(%r9), %ymm3 16757; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16758; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16759; AVX2-FP-NEXT: vbroadcastsd 80(%rax), %ymm5 16760; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 16761; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16762; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 16763; AVX2-FP-NEXT: vbroadcastsd 88(%rdx), %ymm1 16764; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] 16765; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = 
ymm2[1],ymm3[1],ymm2[3],ymm3[3] 16766; AVX2-FP-NEXT: vbroadcastsd 88(%r10), %ymm1 16767; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3] 16768; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0 16769; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm1 16770; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16771; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16772; AVX2-FP-NEXT: vbroadcastsd 112(%rcx), %ymm3 16773; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16774; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16775; AVX2-FP-NEXT: vmovaps 96(%r8), %ymm2 16776; AVX2-FP-NEXT: vmovaps 96(%r9), %ymm3 16777; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16778; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16779; AVX2-FP-NEXT: vbroadcastsd 112(%rax), %ymm5 16780; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 16781; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16782; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 16783; AVX2-FP-NEXT: vbroadcastsd 120(%rdx), %ymm1 16784; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] 16785; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 16786; AVX2-FP-NEXT: vbroadcastsd 120(%r10), %ymm1 16787; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16788; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16789; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0 16790; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm1 16791; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16792; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16793; AVX2-FP-NEXT: vbroadcastsd 144(%rcx), %ymm3 16794; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16795; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16796; AVX2-FP-NEXT: vmovaps 128(%r8), %ymm2 16797; AVX2-FP-NEXT: vmovaps 128(%r9), %ymm3 16798; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16799; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16800; AVX2-FP-NEXT: vbroadcastsd 144(%rax), %ymm5 16801; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 16802; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16803; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 16804; AVX2-FP-NEXT: vbroadcastsd 152(%rdx), %ymm1 16805; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16806; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16807; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 16808; AVX2-FP-NEXT: vbroadcastsd 152(%r10), %ymm1 16809; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16810; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16811; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm0 16812; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm1 16813; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16814; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16815; AVX2-FP-NEXT: vbroadcastsd 176(%rcx), %ymm3 16816; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16817; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16818; AVX2-FP-NEXT: vmovaps 160(%r8), %ymm2 16819; AVX2-FP-NEXT: vmovaps 160(%r9), %ymm3 16820; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16821; 
AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16822; AVX2-FP-NEXT: vbroadcastsd 176(%rax), %ymm5 16823; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 16824; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16825; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 16826; AVX2-FP-NEXT: vbroadcastsd 184(%rdx), %ymm1 16827; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16828; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16829; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 16830; AVX2-FP-NEXT: vbroadcastsd 184(%r10), %ymm1 16831; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16832; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16833; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm0 16834; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm1 16835; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16836; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16837; AVX2-FP-NEXT: vbroadcastsd 208(%rcx), %ymm3 16838; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16839; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16840; AVX2-FP-NEXT: vmovaps 192(%r8), %ymm2 16841; AVX2-FP-NEXT: vmovaps 192(%r9), %ymm3 16842; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16843; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16844; AVX2-FP-NEXT: vbroadcastsd 208(%rax), %ymm5 16845; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 16846; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16847; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 16848; AVX2-FP-NEXT: vbroadcastsd 216(%rdx), %ymm1 16849; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16850; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 16851; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 16852; AVX2-FP-NEXT: vbroadcastsd 216(%r10), %ymm1 16853; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16854; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16855; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm0 16856; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm1 16857; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16858; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16859; AVX2-FP-NEXT: vbroadcastsd 240(%rcx), %ymm3 16860; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16861; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16862; AVX2-FP-NEXT: vmovaps 224(%r8), %ymm2 16863; AVX2-FP-NEXT: vmovaps 224(%r9), %ymm3 16864; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16865; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16866; AVX2-FP-NEXT: vbroadcastsd 240(%rax), %ymm5 16867; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 16868; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16869; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 16870; AVX2-FP-NEXT: vbroadcastsd 248(%rdx), %ymm1 16871; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16872; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16873; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 16874; AVX2-FP-NEXT: vbroadcastsd 248(%r10), %ymm1 16875; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = 
ymm0[2,3],ymm1[2,3] 16876; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16877; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm0 16878; AVX2-FP-NEXT: vmovaps 256(%rsi), %ymm1 16879; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16880; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16881; AVX2-FP-NEXT: vbroadcastsd 272(%rcx), %ymm3 16882; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16883; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16884; AVX2-FP-NEXT: vmovaps 256(%r8), %ymm2 16885; AVX2-FP-NEXT: vmovaps 256(%r9), %ymm3 16886; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16887; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16888; AVX2-FP-NEXT: vbroadcastsd 272(%rax), %ymm5 16889; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 16890; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16891; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 16892; AVX2-FP-NEXT: vbroadcastsd 280(%rdx), %ymm1 16893; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16894; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16895; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 16896; AVX2-FP-NEXT: vbroadcastsd 280(%r10), %ymm1 16897; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16898; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16899; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm0 16900; AVX2-FP-NEXT: vmovaps 288(%rsi), %ymm1 16901; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16902; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16903; AVX2-FP-NEXT: vbroadcastsd 304(%rcx), %ymm3 16904; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16905; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16906; AVX2-FP-NEXT: vmovaps 288(%r8), %ymm2 16907; AVX2-FP-NEXT: vmovaps 288(%r9), %ymm3 16908; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16909; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16910; AVX2-FP-NEXT: vbroadcastsd 304(%rax), %ymm5 16911; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 16912; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16913; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 16914; AVX2-FP-NEXT: vbroadcastsd 312(%rdx), %ymm1 16915; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16916; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16917; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 16918; AVX2-FP-NEXT: vbroadcastsd 312(%r10), %ymm1 16919; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16920; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16921; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm0 16922; AVX2-FP-NEXT: vmovaps 320(%rsi), %ymm1 16923; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16924; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16925; AVX2-FP-NEXT: vbroadcastsd 336(%rcx), %ymm3 16926; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16927; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16928; AVX2-FP-NEXT: vmovaps 320(%r8), %ymm2 16929; AVX2-FP-NEXT: vmovaps 320(%r9), %ymm3 16930; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = 
ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16931; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16932; AVX2-FP-NEXT: vbroadcastsd 336(%rax), %ymm5 16933; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 16934; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16935; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 16936; AVX2-FP-NEXT: vbroadcastsd 344(%rdx), %ymm1 16937; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16938; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16939; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 16940; AVX2-FP-NEXT: vbroadcastsd 344(%r10), %ymm1 16941; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16942; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16943; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm0 16944; AVX2-FP-NEXT: vmovaps 352(%rsi), %ymm1 16945; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16946; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16947; AVX2-FP-NEXT: vbroadcastsd 368(%rcx), %ymm3 16948; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16949; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16950; AVX2-FP-NEXT: vmovaps 352(%r8), %ymm2 16951; AVX2-FP-NEXT: vmovaps 352(%r9), %ymm3 16952; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16953; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16954; AVX2-FP-NEXT: vbroadcastsd 368(%rax), %ymm5 16955; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 16956; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16957; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 16958; AVX2-FP-NEXT: vbroadcastsd 376(%rdx), %ymm1 16959; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16960; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16961; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 16962; AVX2-FP-NEXT: vbroadcastsd 376(%r10), %ymm1 16963; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16964; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16965; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm0 16966; AVX2-FP-NEXT: vmovaps 384(%rsi), %ymm1 16967; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16968; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16969; AVX2-FP-NEXT: vbroadcastsd 400(%rcx), %ymm3 16970; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16971; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16972; AVX2-FP-NEXT: vmovaps 384(%r8), %ymm2 16973; AVX2-FP-NEXT: vmovaps 384(%r9), %ymm3 16974; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16975; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16976; AVX2-FP-NEXT: vbroadcastsd 400(%rax), %ymm5 16977; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 16978; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16979; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 16980; AVX2-FP-NEXT: vbroadcastsd 408(%rdx), %ymm1 16981; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16982; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16983; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 16984; AVX2-FP-NEXT: vbroadcastsd 
408(%r10), %ymm1 16985; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 16986; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16987; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm0 16988; AVX2-FP-NEXT: vmovaps 416(%rsi), %ymm1 16989; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 16990; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 16991; AVX2-FP-NEXT: vbroadcastsd 432(%rcx), %ymm3 16992; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 16993; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16994; AVX2-FP-NEXT: vmovaps 416(%r8), %ymm2 16995; AVX2-FP-NEXT: vmovaps 416(%r9), %ymm3 16996; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 16997; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 16998; AVX2-FP-NEXT: vbroadcastsd 432(%rax), %ymm5 16999; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 17000; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17001; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 17002; AVX2-FP-NEXT: vbroadcastsd 440(%rdx), %ymm1 17003; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 17004; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17005; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 17006; AVX2-FP-NEXT: vbroadcastsd 440(%r10), %ymm1 17007; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 17008; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17009; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm0 17010; AVX2-FP-NEXT: vmovaps 448(%rsi), %ymm1 17011; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 17012; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 17013; AVX2-FP-NEXT: vbroadcastsd 464(%rcx), %ymm3 17014; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 17015; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17016; AVX2-FP-NEXT: vmovaps 448(%r8), %ymm2 17017; AVX2-FP-NEXT: vmovaps 448(%r9), %ymm3 17018; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 17019; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] 17020; AVX2-FP-NEXT: vbroadcastsd 464(%rax), %ymm5 17021; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 17022; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17023; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 17024; AVX2-FP-NEXT: vbroadcastsd 472(%rdx), %ymm1 17025; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 17026; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17027; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] 17028; AVX2-FP-NEXT: vbroadcastsd 472(%r10), %ymm1 17029; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 17030; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17031; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm0 17032; AVX2-FP-NEXT: vmovaps 480(%rsi), %ymm1 17033; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 17034; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 17035; AVX2-FP-NEXT: vbroadcastsd 496(%rcx), %ymm3 17036; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 17037; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17038; AVX2-FP-NEXT: vmovaps 480(%r8), %ymm3 17039; AVX2-FP-NEXT: vmovaps 480(%r9), %ymm4 17040; 
AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] 17041; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] 17042; AVX2-FP-NEXT: vbroadcastsd 496(%rax), %ymm5 17043; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm5[6,7] 17044; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 17045; AVX2-FP-NEXT: vbroadcastsd 504(%rdx), %ymm1 17046; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] 17047; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] 17048; AVX2-FP-NEXT: vbroadcastsd 504(%r10), %ymm1 17049; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] 17050; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rdx 17051; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] 17052; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17053; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],mem[6,7] 17054; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17055; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],mem[6,7] 17056; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17057; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] 17058; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17059; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],mem[6,7] 17060; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17061; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] 17062; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17063; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] 17064; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17065; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17066; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 17067; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17068; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17069; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 17070; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17071; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17072; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 17073; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17074; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17075; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 17076; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17077; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17078; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 17079; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17080; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 17081; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 17082; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 17083; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17084; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 17085; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17086; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17087; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 17088; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
17089; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17090; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 17091; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17092; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17093; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 17094; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17095; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17096; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] 17097; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17098; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],mem[6,7] 17099; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17100; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7] 17101; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17102; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7] 17103; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17104; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],mem[6,7] 17105; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17106; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] 17107; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17108; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] 17109; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17110; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7] 17111; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17112; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] 17113; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17114; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] 17115; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17116; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] 17117; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17118; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] 17119; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17120; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 17121; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] 17122; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] 17123; AVX2-FP-NEXT: vmovaps %ymm9, 4064(%rdx) 17124; AVX2-FP-NEXT: vmovaps %ymm11, 4032(%rdx) 17125; AVX2-FP-NEXT: vmovaps %ymm15, 4000(%rdx) 17126; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 17127; AVX2-FP-NEXT: vmovaps %ymm9, 3968(%rdx) 17128; AVX2-FP-NEXT: vmovaps %ymm0, 3808(%rdx) 17129; AVX2-FP-NEXT: vmovaps %ymm1, 3776(%rdx) 17130; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17131; AVX2-FP-NEXT: vmovaps %ymm0, 3744(%rdx) 17132; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17133; AVX2-FP-NEXT: vmovaps %ymm0, 3712(%rdx) 17134; AVX2-FP-NEXT: vmovaps %ymm2, 3552(%rdx) 17135; AVX2-FP-NEXT: vmovaps %ymm3, 3520(%rdx) 17136; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17137; AVX2-FP-NEXT: vmovaps %ymm0, 3488(%rdx) 17138; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17139; AVX2-FP-NEXT: vmovaps %ymm0, 3456(%rdx) 17140; AVX2-FP-NEXT: 
vmovaps %ymm4, 3296(%rdx) 17141; AVX2-FP-NEXT: vmovaps %ymm5, 3264(%rdx) 17142; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17143; AVX2-FP-NEXT: vmovaps %ymm0, 3232(%rdx) 17144; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17145; AVX2-FP-NEXT: vmovaps %ymm0, 3200(%rdx) 17146; AVX2-FP-NEXT: vmovaps %ymm6, 3040(%rdx) 17147; AVX2-FP-NEXT: vmovaps %ymm7, 3008(%rdx) 17148; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17149; AVX2-FP-NEXT: vmovaps %ymm0, 2976(%rdx) 17150; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17151; AVX2-FP-NEXT: vmovaps %ymm0, 2944(%rdx) 17152; AVX2-FP-NEXT: vmovaps %ymm8, 2784(%rdx) 17153; AVX2-FP-NEXT: vmovaps %ymm10, 2752(%rdx) 17154; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17155; AVX2-FP-NEXT: vmovaps %ymm0, 2720(%rdx) 17156; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17157; AVX2-FP-NEXT: vmovaps %ymm0, 2688(%rdx) 17158; AVX2-FP-NEXT: vmovaps %ymm12, 2528(%rdx) 17159; AVX2-FP-NEXT: vmovaps %ymm13, 2496(%rdx) 17160; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17161; AVX2-FP-NEXT: vmovaps %ymm0, 2464(%rdx) 17162; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17163; AVX2-FP-NEXT: vmovaps %ymm0, 2432(%rdx) 17164; AVX2-FP-NEXT: vmovaps %ymm14, 2272(%rdx) 17165; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17166; AVX2-FP-NEXT: vmovaps %ymm0, 2240(%rdx) 17167; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17168; AVX2-FP-NEXT: vmovaps %ymm0, 2208(%rdx) 17169; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17170; AVX2-FP-NEXT: vmovaps %ymm0, 2176(%rdx) 17171; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17172; AVX2-FP-NEXT: vmovaps %ymm0, 2016(%rdx) 17173; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17174; AVX2-FP-NEXT: vmovaps %ymm0, 1984(%rdx) 17175; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17176; AVX2-FP-NEXT: vmovaps %ymm0, 1952(%rdx) 17177; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17178; AVX2-FP-NEXT: vmovaps %ymm0, 1920(%rdx) 17179; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17180; AVX2-FP-NEXT: vmovaps %ymm0, 1760(%rdx) 17181; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 17182; AVX2-FP-NEXT: vmovaps %ymm0, 1728(%rdx) 17183; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17184; AVX2-FP-NEXT: vmovaps %ymm0, 1696(%rdx) 17185; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17186; AVX2-FP-NEXT: vmovaps %ymm0, 1664(%rdx) 17187; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17188; AVX2-FP-NEXT: vmovaps %ymm0, 1504(%rdx) 17189; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17190; AVX2-FP-NEXT: vmovaps %ymm0, 1472(%rdx) 17191; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17192; AVX2-FP-NEXT: vmovaps %ymm0, 1440(%rdx) 17193; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17194; AVX2-FP-NEXT: vmovaps %ymm0, 1408(%rdx) 17195; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17196; AVX2-FP-NEXT: vmovaps %ymm0, 1248(%rdx) 17197; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17198; AVX2-FP-NEXT: vmovaps %ymm0, 1216(%rdx) 17199; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17200; AVX2-FP-NEXT: vmovaps %ymm0, 1184(%rdx) 17201; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17202; AVX2-FP-NEXT: vmovaps %ymm0, 1152(%rdx) 17203; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17204; AVX2-FP-NEXT: vmovaps %ymm0, 992(%rdx) 17205; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17206; AVX2-FP-NEXT: vmovaps %ymm0, 960(%rdx) 17207; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17208; AVX2-FP-NEXT: vmovaps %ymm0, 928(%rdx) 17209; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17210; AVX2-FP-NEXT: vmovaps %ymm0, 896(%rdx) 17211; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17212; AVX2-FP-NEXT: vmovaps %ymm0, 736(%rdx) 17213; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17214; AVX2-FP-NEXT: vmovaps %ymm0, 704(%rdx) 17215; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17216; AVX2-FP-NEXT: vmovaps %ymm0, 672(%rdx) 17217; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17218; AVX2-FP-NEXT: vmovaps %ymm0, 640(%rdx) 17219; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17220; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rdx) 17221; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17222; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rdx) 17223; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17224; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rdx) 17225; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17226; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rdx) 17227; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17228; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rdx) 17229; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17230; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rdx) 17231; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17232; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rdx) 17233; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17234; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rdx) 17235; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17236; AVX2-FP-NEXT: vmovaps %ymm0, 3936(%rdx) 17237; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17238; AVX2-FP-NEXT: vmovaps %ymm0, 3904(%rdx) 17239; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17240; AVX2-FP-NEXT: vmovaps %ymm0, 3872(%rdx) 17241; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17242; AVX2-FP-NEXT: vmovaps %ymm0, 3840(%rdx) 17243; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17244; AVX2-FP-NEXT: vmovaps %ymm0, 3680(%rdx) 17245; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17246; AVX2-FP-NEXT: vmovaps %ymm0, 3648(%rdx) 17247; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17248; AVX2-FP-NEXT: vmovaps %ymm0, 3616(%rdx) 17249; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17250; AVX2-FP-NEXT: vmovaps %ymm0, 3584(%rdx) 17251; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17252; AVX2-FP-NEXT: vmovaps %ymm0, 3424(%rdx) 17253; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17254; AVX2-FP-NEXT: vmovaps %ymm0, 3392(%rdx) 17255; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17256; AVX2-FP-NEXT: 
vmovaps %ymm0, 3360(%rdx) 17257; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17258; AVX2-FP-NEXT: vmovaps %ymm0, 3328(%rdx) 17259; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17260; AVX2-FP-NEXT: vmovaps %ymm0, 3168(%rdx) 17261; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17262; AVX2-FP-NEXT: vmovaps %ymm0, 3136(%rdx) 17263; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17264; AVX2-FP-NEXT: vmovaps %ymm0, 3104(%rdx) 17265; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17266; AVX2-FP-NEXT: vmovaps %ymm0, 3072(%rdx) 17267; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17268; AVX2-FP-NEXT: vmovaps %ymm0, 2912(%rdx) 17269; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17270; AVX2-FP-NEXT: vmovaps %ymm0, 2880(%rdx) 17271; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17272; AVX2-FP-NEXT: vmovaps %ymm0, 2848(%rdx) 17273; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17274; AVX2-FP-NEXT: vmovaps %ymm0, 2816(%rdx) 17275; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17276; AVX2-FP-NEXT: vmovaps %ymm0, 2656(%rdx) 17277; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17278; AVX2-FP-NEXT: vmovaps %ymm0, 2624(%rdx) 17279; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17280; AVX2-FP-NEXT: vmovaps %ymm0, 2592(%rdx) 17281; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17282; AVX2-FP-NEXT: vmovaps %ymm0, 2560(%rdx) 17283; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17284; AVX2-FP-NEXT: vmovaps %ymm0, 2400(%rdx) 17285; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17286; AVX2-FP-NEXT: vmovaps %ymm0, 2368(%rdx) 17287; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17288; AVX2-FP-NEXT: vmovaps %ymm0, 2336(%rdx) 17289; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17290; AVX2-FP-NEXT: vmovaps %ymm0, 2304(%rdx) 17291; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17292; AVX2-FP-NEXT: vmovaps %ymm0, 2144(%rdx) 17293; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17294; AVX2-FP-NEXT: vmovaps %ymm0, 2112(%rdx) 17295; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17296; AVX2-FP-NEXT: vmovaps %ymm0, 2080(%rdx) 17297; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17298; AVX2-FP-NEXT: vmovaps %ymm0, 2048(%rdx) 17299; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17300; AVX2-FP-NEXT: vmovaps %ymm0, 1888(%rdx) 17301; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17302; AVX2-FP-NEXT: vmovaps %ymm0, 1856(%rdx) 17303; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17304; AVX2-FP-NEXT: vmovaps %ymm0, 1824(%rdx) 17305; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17306; AVX2-FP-NEXT: vmovaps %ymm0, 1792(%rdx) 17307; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17308; AVX2-FP-NEXT: vmovaps %ymm0, 1632(%rdx) 17309; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17310; AVX2-FP-NEXT: vmovaps %ymm0, 1600(%rdx) 17311; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17312; AVX2-FP-NEXT: vmovaps %ymm0, 1568(%rdx) 17313; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17314; AVX2-FP-NEXT: vmovaps %ymm0, 1536(%rdx) 17315; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17316; AVX2-FP-NEXT: vmovaps %ymm0, 1376(%rdx) 17317; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17318; AVX2-FP-NEXT: vmovaps %ymm0, 1344(%rdx) 17319; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17320; AVX2-FP-NEXT: vmovaps %ymm0, 1312(%rdx) 17321; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17322; AVX2-FP-NEXT: vmovaps %ymm0, 1280(%rdx) 17323; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17324; AVX2-FP-NEXT: vmovaps %ymm0, 1120(%rdx) 17325; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17326; AVX2-FP-NEXT: vmovaps %ymm0, 1088(%rdx) 17327; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17328; AVX2-FP-NEXT: vmovaps %ymm0, 1056(%rdx) 17329; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17330; AVX2-FP-NEXT: vmovaps %ymm0, 1024(%rdx) 17331; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17332; AVX2-FP-NEXT: vmovaps %ymm0, 864(%rdx) 17333; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17334; AVX2-FP-NEXT: vmovaps %ymm0, 832(%rdx) 17335; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17336; AVX2-FP-NEXT: vmovaps %ymm0, 800(%rdx) 17337; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17338; AVX2-FP-NEXT: vmovaps %ymm0, 768(%rdx) 17339; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17340; AVX2-FP-NEXT: vmovaps %ymm0, 608(%rdx) 17341; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17342; AVX2-FP-NEXT: vmovaps %ymm0, 576(%rdx) 17343; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17344; AVX2-FP-NEXT: vmovaps %ymm0, 544(%rdx) 17345; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17346; AVX2-FP-NEXT: vmovaps %ymm0, 512(%rdx) 17347; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17348; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rdx) 17349; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17350; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rdx) 17351; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17352; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rdx) 17353; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17354; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rdx) 17355; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17356; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rdx) 17357; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17358; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rdx) 17359; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17360; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rdx) 17361; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 17362; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx) 17363; AVX2-FP-NEXT: addq $3880, %rsp # imm = 0xF28 17364; AVX2-FP-NEXT: vzeroupper 17365; AVX2-FP-NEXT: retq 17366; 17367; AVX2-FCP-LABEL: store_i64_stride8_vf64: 17368; AVX2-FCP: # %bb.0: 17369; AVX2-FCP-NEXT: subq $3880, %rsp # imm = 0xF28 17370; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 17371; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 17372; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm0 17373; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17374; 
AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 17375; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm2 17376; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17377; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm3 17378; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17379; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm1 17380; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17381; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4 17382; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17383; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 17384; AVX2-FCP-NEXT: vbroadcastsd 8(%rdx), %ymm2 17385; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 17386; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 17387; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17388; AVX2-FCP-NEXT: vmovaps (%rax), %xmm0 17389; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17390; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 17391; AVX2-FCP-NEXT: vmovaps (%r9), %xmm2 17392; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17393; AVX2-FCP-NEXT: vmovaps (%r8), %xmm1 17394; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17395; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 17396; AVX2-FCP-NEXT: vbroadcastsd 8(%r10), %ymm2 17397; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 17398; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 17399; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17400; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm0 17401; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17402; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 17403; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1] 17404; AVX2-FCP-NEXT: vbroadcastsd 40(%rdx), %ymm2 17405; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 17406; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 17407; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17408; AVX2-FCP-NEXT: vmovaps 32(%r9), %xmm1 17409; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17410; AVX2-FCP-NEXT: vmovaps 32(%r8), %xmm0 17411; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17412; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17413; AVX2-FCP-NEXT: vbroadcastsd 40(%r10), %ymm1 17414; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17415; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm1 17416; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17417; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17418; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17419; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17420; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm1 17421; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17422; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm0 17423; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17424; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17425; AVX2-FCP-NEXT: vbroadcastsd 72(%rdx), %ymm1 17426; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17427; AVX2-FCP-NEXT: vmovaps 64(%rcx), %xmm1 17428; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17429; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, 
%ymm0, %ymm1 17430; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17431; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17432; AVX2-FCP-NEXT: vmovaps 64(%r9), %xmm1 17433; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17434; AVX2-FCP-NEXT: vmovaps 64(%r8), %xmm0 17435; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17436; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17437; AVX2-FCP-NEXT: vbroadcastsd 72(%r10), %ymm1 17438; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17439; AVX2-FCP-NEXT: vmovaps 64(%rax), %xmm1 17440; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17441; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17442; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17443; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17444; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm1 17445; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17446; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm0 17447; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17448; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17449; AVX2-FCP-NEXT: vbroadcastsd 104(%rdx), %ymm1 17450; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17451; AVX2-FCP-NEXT: vmovaps 96(%rcx), %xmm1 17452; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17453; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17454; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17455; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17456; AVX2-FCP-NEXT: vmovaps 96(%r9), %xmm1 17457; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17458; AVX2-FCP-NEXT: vmovaps 96(%r8), %xmm0 17459; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17460; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17461; AVX2-FCP-NEXT: vbroadcastsd 104(%r10), %ymm1 17462; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17463; AVX2-FCP-NEXT: vmovaps 96(%rax), %xmm1 17464; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17465; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17466; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17467; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17468; AVX2-FCP-NEXT: vmovaps 128(%rsi), %xmm1 17469; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17470; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm0 17471; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17472; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17473; AVX2-FCP-NEXT: vbroadcastsd 136(%rdx), %ymm1 17474; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17475; AVX2-FCP-NEXT: vmovaps 128(%rcx), %xmm1 17476; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17477; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17478; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17479; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17480; AVX2-FCP-NEXT: vmovaps 128(%r9), %xmm1 17481; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17482; AVX2-FCP-NEXT: vmovaps 128(%r8), %xmm0 17483; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17484; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 
= xmm0[1],xmm1[1] 17485; AVX2-FCP-NEXT: vbroadcastsd 136(%r10), %ymm1 17486; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17487; AVX2-FCP-NEXT: vmovaps 128(%rax), %xmm1 17488; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17489; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17490; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17491; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17492; AVX2-FCP-NEXT: vmovaps 160(%rsi), %xmm1 17493; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17494; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm0 17495; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17496; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17497; AVX2-FCP-NEXT: vbroadcastsd 168(%rdx), %ymm1 17498; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17499; AVX2-FCP-NEXT: vmovaps 160(%rcx), %xmm1 17500; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17501; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17502; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17503; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17504; AVX2-FCP-NEXT: vmovaps 160(%r9), %xmm1 17505; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17506; AVX2-FCP-NEXT: vmovaps 160(%r8), %xmm0 17507; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17508; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17509; AVX2-FCP-NEXT: vbroadcastsd 168(%r10), %ymm1 17510; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17511; AVX2-FCP-NEXT: vmovaps 160(%rax), %xmm1 17512; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17513; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17514; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17515; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17516; AVX2-FCP-NEXT: vmovaps 192(%rsi), %xmm1 17517; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17518; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm0 17519; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17520; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17521; AVX2-FCP-NEXT: vbroadcastsd 200(%rdx), %ymm1 17522; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17523; AVX2-FCP-NEXT: vmovaps 192(%rcx), %xmm1 17524; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17525; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17526; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17527; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17528; AVX2-FCP-NEXT: vmovaps 192(%r9), %xmm1 17529; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17530; AVX2-FCP-NEXT: vmovaps 192(%r8), %xmm0 17531; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17532; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17533; AVX2-FCP-NEXT: vbroadcastsd 200(%r10), %ymm1 17534; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17535; AVX2-FCP-NEXT: vmovaps 192(%rax), %xmm1 17536; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17537; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17538; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17539; AVX2-FCP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17540; AVX2-FCP-NEXT: vmovaps 224(%rsi), %xmm1 17541; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17542; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm0 17543; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17544; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17545; AVX2-FCP-NEXT: vbroadcastsd 232(%rdx), %ymm1 17546; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17547; AVX2-FCP-NEXT: vmovaps 224(%rcx), %xmm1 17548; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17549; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17550; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17551; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17552; AVX2-FCP-NEXT: vmovaps 224(%r9), %xmm1 17553; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17554; AVX2-FCP-NEXT: vmovaps 224(%r8), %xmm0 17555; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17556; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17557; AVX2-FCP-NEXT: vbroadcastsd 232(%r10), %ymm1 17558; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17559; AVX2-FCP-NEXT: vmovaps 224(%rax), %xmm1 17560; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17561; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17562; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17563; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17564; AVX2-FCP-NEXT: vmovaps 256(%rsi), %xmm1 17565; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17566; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm0 17567; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17568; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17569; AVX2-FCP-NEXT: vbroadcastsd 264(%rdx), %ymm1 17570; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17571; AVX2-FCP-NEXT: vmovaps 256(%rcx), %xmm1 17572; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17573; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17574; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17575; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17576; AVX2-FCP-NEXT: vmovaps 256(%r9), %xmm1 17577; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17578; AVX2-FCP-NEXT: vmovaps 256(%r8), %xmm0 17579; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17580; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17581; AVX2-FCP-NEXT: vbroadcastsd 264(%r10), %ymm1 17582; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17583; AVX2-FCP-NEXT: vmovaps 256(%rax), %xmm1 17584; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17585; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17586; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17587; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17588; AVX2-FCP-NEXT: vmovaps 288(%rsi), %xmm1 17589; AVX2-FCP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill 17590; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm0 17591; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17592; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17593; AVX2-FCP-NEXT: vbroadcastsd 296(%rdx), %ymm1 17594; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] 17595; AVX2-FCP-NEXT: vmovaps 288(%rcx), %xmm1 17596; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17597; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17598; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17599; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17600; AVX2-FCP-NEXT: vmovaps 288(%r9), %xmm1 17601; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17602; AVX2-FCP-NEXT: vmovaps 288(%r8), %xmm0 17603; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17604; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17605; AVX2-FCP-NEXT: vbroadcastsd 296(%r10), %ymm1 17606; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17607; AVX2-FCP-NEXT: vmovaps 288(%rax), %xmm1 17608; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17609; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17610; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17611; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17612; AVX2-FCP-NEXT: vmovaps 320(%rsi), %xmm1 17613; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17614; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm0 17615; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17616; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17617; AVX2-FCP-NEXT: vbroadcastsd 328(%rdx), %ymm1 17618; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17619; AVX2-FCP-NEXT: vmovaps 320(%rcx), %xmm1 17620; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17621; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17622; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17623; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17624; AVX2-FCP-NEXT: vmovaps 320(%r9), %xmm1 17625; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17626; AVX2-FCP-NEXT: vmovaps 320(%r8), %xmm0 17627; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17628; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17629; AVX2-FCP-NEXT: vbroadcastsd 328(%r10), %ymm1 17630; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17631; AVX2-FCP-NEXT: vmovaps 320(%rax), %xmm1 17632; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17633; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17634; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17635; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17636; AVX2-FCP-NEXT: vmovaps 352(%rsi), %xmm1 17637; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17638; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm0 17639; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17640; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17641; AVX2-FCP-NEXT: vbroadcastsd 360(%rdx), %ymm1 17642; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17643; AVX2-FCP-NEXT: vmovaps 352(%rcx), %xmm1 17644; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17645; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17646; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17647; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17648; AVX2-FCP-NEXT: vmovaps 352(%r9), %xmm1 17649; AVX2-FCP-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17650; AVX2-FCP-NEXT: vmovaps 352(%r8), %xmm0 17651; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17652; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17653; AVX2-FCP-NEXT: vbroadcastsd 360(%r10), %ymm1 17654; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17655; AVX2-FCP-NEXT: vmovaps 352(%rax), %xmm1 17656; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17657; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17658; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17659; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17660; AVX2-FCP-NEXT: vmovaps 384(%rsi), %xmm1 17661; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17662; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm0 17663; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17664; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17665; AVX2-FCP-NEXT: vbroadcastsd 392(%rdx), %ymm1 17666; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17667; AVX2-FCP-NEXT: vmovaps 384(%rcx), %xmm1 17668; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17669; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17670; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17671; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17672; AVX2-FCP-NEXT: vmovaps 384(%r9), %xmm1 17673; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17674; AVX2-FCP-NEXT: vmovaps 384(%r8), %xmm0 17675; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17676; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17677; AVX2-FCP-NEXT: vbroadcastsd 392(%r10), %ymm1 17678; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17679; AVX2-FCP-NEXT: vmovaps 384(%rax), %xmm1 17680; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17681; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17682; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17683; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17684; AVX2-FCP-NEXT: vmovaps 416(%rsi), %xmm1 17685; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17686; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm0 17687; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17688; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 17689; AVX2-FCP-NEXT: vbroadcastsd 424(%rdx), %ymm1 17690; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17691; AVX2-FCP-NEXT: vmovaps 416(%rcx), %xmm1 17692; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17693; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 17694; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17695; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17696; AVX2-FCP-NEXT: vmovaps 416(%r9), %xmm0 17697; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 17698; AVX2-FCP-NEXT: vmovaps 416(%r8), %xmm13 17699; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] 17700; AVX2-FCP-NEXT: vbroadcastsd 424(%r10), %ymm1 17701; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17702; AVX2-FCP-NEXT: vmovaps 416(%rax), %xmm12 17703; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 17704; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm1[6,7] 17705; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17706; AVX2-FCP-NEXT: vmovaps 448(%rsi), %xmm11 17707; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm10 17708; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] 17709; AVX2-FCP-NEXT: vbroadcastsd 456(%rdx), %ymm1 17710; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17711; AVX2-FCP-NEXT: vmovaps 448(%rcx), %xmm9 17712; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 17713; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17714; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17715; AVX2-FCP-NEXT: vmovaps 448(%r9), %xmm8 17716; AVX2-FCP-NEXT: vmovaps 448(%r8), %xmm7 17717; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] 17718; AVX2-FCP-NEXT: vbroadcastsd 456(%r10), %ymm1 17719; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17720; AVX2-FCP-NEXT: vmovaps 448(%rax), %xmm6 17721; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 17722; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17723; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17724; AVX2-FCP-NEXT: vmovaps 480(%rsi), %xmm5 17725; AVX2-FCP-NEXT: vmovaps 480(%rdi), %xmm4 17726; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] 17727; AVX2-FCP-NEXT: vbroadcastsd 488(%rdx), %ymm1 17728; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17729; AVX2-FCP-NEXT: vmovaps 480(%rcx), %xmm3 17730; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 17731; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 17732; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17733; AVX2-FCP-NEXT: vmovaps 480(%r9), %xmm2 17734; AVX2-FCP-NEXT: vmovaps 480(%r8), %xmm1 17735; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] 17736; AVX2-FCP-NEXT: vbroadcastsd 488(%r10), %ymm15 17737; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] 17738; AVX2-FCP-NEXT: vmovaps 480(%rax), %xmm0 17739; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 17740; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] 17741; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17742; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 17743; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 17744; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] 17745; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm14 17746; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 17747; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 17748; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17749; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 17750; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 17751; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] 17752; AVX2-FCP-NEXT: vinsertf128 $1, (%r10), %ymm14, %ymm14 17753; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 17754; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 17755; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17756; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 17757; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 17758; AVX2-FCP-NEXT: 
# xmm14 = xmm14[0],mem[0] 17759; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rdx), %ymm14, %ymm14 17760; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 17761; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 17762; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17763; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 17764; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 17765; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] 17766; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14 17767; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 17768; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 17769; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17770; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 17771; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 17772; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] 17773; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm14, %ymm14 17774; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 17775; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 17776; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17777; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 17778; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 17779; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] 17780; AVX2-FCP-NEXT: vinsertf128 $1, 64(%r10), %ymm14, %ymm14 17781; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 17782; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 17783; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17784; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 17785; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 17786; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] 17787; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14 17788; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 17789; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 17790; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17791; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 17792; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 17793; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] 17794; AVX2-FCP-NEXT: vinsertf128 $1, 96(%r10), %ymm14, %ymm14 17795; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 17796; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 17797; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17798; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 17799; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 17800; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] 17801; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdx), %ymm14, %ymm14 17802; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload 17803; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 17804; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 128(%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 160(%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 192(%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 224(%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 256(%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd (%rsp), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 288(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 288(%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 320(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 320(%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 352(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 352(%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 384(%r10), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 416(%rdx), %ymm14, %ymm14
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm13 = xmm13[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 416(%r10), %ymm13, %ymm13
; AVX2-FCP-NEXT: vbroadcastsd %xmm12, %ymm12
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdx), %ymm10, %ymm10
; AVX2-FCP-NEXT: vbroadcastsd %xmm9, %ymm9
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 448(%r10), %ymm7, %ymm7
; AVX2-FCP-NEXT: vbroadcastsd %xmm6, %ymm6
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdx), %ymm4, %ymm4
; AVX2-FCP-NEXT: vbroadcastsd %xmm3, %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FCP-NEXT: vinsertf128 $1, 480(%r10), %ymm1, %ymm1
; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 16(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps (%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 16(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 24(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 24(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 48(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 56(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 56(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 80(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 64(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 80(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 88(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 88(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 112(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 96(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 96(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 112(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 120(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 120(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 144(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 128(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 128(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 144(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 152(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 152(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 176(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 160(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 176(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 184(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 184(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 208(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 192(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 208(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 216(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 216(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 240(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 224(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 224(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 240(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 248(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 248(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 256(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 272(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 256(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 256(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 272(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 280(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 280(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 288(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 304(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 288(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 288(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 304(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 312(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 312(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 320(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 336(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 320(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 320(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 336(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 344(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 344(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 352(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 368(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 352(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 352(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 368(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 376(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 376(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 384(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 400(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 384(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 384(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 400(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 408(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 408(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 416(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 432(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 416(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 416(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 432(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 440(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 440(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 448(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 464(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 448(%r8), %ymm2
; AVX2-FCP-NEXT: vmovaps 448(%r9), %ymm3
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 464(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 472(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vbroadcastsd 472(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 480(%rsi), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 496(%rcx), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 480(%r8), %ymm3
; AVX2-FCP-NEXT: vmovaps 480(%r9), %ymm4
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
; AVX2-FCP-NEXT: vbroadcastsd 496(%rax), %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 504(%rdx), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
; AVX2-FCP-NEXT: vbroadcastsd 504(%r10), %ymm1
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovaps %ymm9, 4064(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm11, 4032(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm15, 4000(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm9, 3968(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm0, 3808(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm1, 3776(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3744(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3712(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm2, 3552(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm3, 3520(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3488(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3456(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm4, 3296(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm5, 3264(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3232(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3200(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm6, 3040(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm7, 3008(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2976(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2944(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm8, 2784(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm10, 2752(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2720(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2688(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm12, 2528(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm13, 2496(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2464(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2432(%rdx)
; AVX2-FCP-NEXT: vmovaps %ymm14, 2272(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2240(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2208(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2176(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2016(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1984(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1952(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1920(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1760(%rdx)
; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1728(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1696(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1664(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1504(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1472(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1440(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1408(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1248(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1216(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1184(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1152(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 992(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 960(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 928(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 896(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 736(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 704(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 672(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 640(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3936(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3904(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3872(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3840(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3680(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3648(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3616(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3584(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3424(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3392(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3360(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3328(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3168(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3136(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3104(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 3072(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2912(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2880(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2848(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2816(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2656(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2624(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2592(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2560(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2400(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2368(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2336(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2304(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2144(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2112(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2080(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 2048(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1888(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1856(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1824(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1792(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1632(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1600(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1568(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1536(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1376(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1344(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1312(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1280(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1120(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1088(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1056(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1024(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 864(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 832(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 800(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 768(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 608(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 576(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 544(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 512(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-FCP-NEXT: addq $3880, %rsp # imm = 0xF28
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i64_stride8_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $5384, %rsp # imm = 0x1508
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm0
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm14
; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm1
; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5
; AVX512-NEXT: vmovdqa64 (%rsi), %zmm10
; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm2
; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512-NEXT: vmovdqa64 (%rdx), %zmm12
; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm3
; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm9
; AVX512-NEXT: vmovdqa64 (%rcx), %zmm15
; AVX512-NEXT: vmovdqa64 (%r8), %zmm30
; AVX512-NEXT: vmovdqa64 64(%r8), %zmm18
; AVX512-NEXT: vmovdqa64 128(%r8), %zmm11
; AVX512-NEXT: vmovdqa64 (%r9), %zmm24
; AVX512-NEXT: vmovdqa64 64(%r9), %zmm28
; AVX512-NEXT: vmovdqa64 128(%r9), %zmm22
; AVX512-NEXT: vmovdqa64 (%r10), %zmm26
; AVX512-NEXT: vmovdqa64 64(%r10), %zmm31
; AVX512-NEXT: vmovdqa64 128(%r10), %zmm16
; AVX512-NEXT: vmovdqa64 (%rax), %zmm17
; AVX512-NEXT: vmovdqa64 64(%rax), %zmm27
; AVX512-NEXT: vmovdqa64 128(%rax), %zmm13
; AVX512-NEXT: movb $-64, %r11b
; AVX512-NEXT: kmovw %r11d, %k1
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm12, %zmm8
; AVX512-NEXT: vpermt2q %zmm15, %zmm19, %zmm8
; AVX512-NEXT: vmovdqa64 %zmm14, %zmm6
; AVX512-NEXT: vpermt2q %zmm10, %zmm19, %zmm6
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm26, %zmm8
; AVX512-NEXT: vpermt2q %zmm17, %zmm19, %zmm8
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6]
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13]
; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm12, %zmm6
; AVX512-NEXT: vpermt2q %zmm15, %zmm21, %zmm6
; AVX512-NEXT: vmovdqa64 %zmm14, %zmm8
; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm8
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm26, %zmm8
; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm8
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7]
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm12, %zmm6
; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm6
; AVX512-NEXT: vmovdqa64 %zmm14, %zmm8
; AVX512-NEXT: vpermt2q %zmm10, %zmm23, %zmm8
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm8
; AVX512-NEXT: vpermt2q %zmm24, %zmm23, %zmm8
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q %zmm15, %zmm25, %zmm12
; AVX512-NEXT: vpermt2q %zmm10, %zmm25, %zmm14
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm8
; AVX512-NEXT: vpermt2q %zmm24, %zmm25, %zmm8
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512-NEXT: vpermt2q %zmm9, %zmm19, %zmm6
; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512-NEXT: vpermt2q %zmm5, %zmm19, %zmm8
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm31, %zmm8
; AVX512-NEXT: vpermt2q %zmm27, %zmm19, %zmm8
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6]
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm6
; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512-NEXT: vpermt2q %zmm5, %zmm21, %zmm8
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm31, %zmm8
; AVX512-NEXT: vpermt2q %zmm27, %zmm21, %zmm8
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7]
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512-NEXT: vpermt2q %zmm9, %zmm23, %zmm6
; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512-NEXT: vpermt2q %zmm5, %zmm23, %zmm8
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm18, %zmm8
; AVX512-NEXT: vpermt2q %zmm28, %zmm23, %zmm8
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm9, %zmm25, %zmm7
; AVX512-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm18, %zmm5
; AVX512-NEXT: vpermt2q %zmm28, %zmm25, %zmm5
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm4
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm5
; AVX512-NEXT: vpermt2q %zmm13, %zmm19, %zmm5
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm4
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm5
; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm5
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512-NEXT: vpermt2q %zmm3, %zmm23, %zmm4
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm22, %zmm23, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm4
; AVX512-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm3
; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm0
; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm1
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm2
; AVX512-NEXT: vpermt2q %zmm22, %zmm25, %zmm2
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm0
; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm2
; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5
; AVX512-NEXT: vpermt2q %zmm2, %zmm19, %zmm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vmovdqa64 192(%r10), %zmm9
; AVX512-NEXT: vmovdqa64 192(%rax), %zmm20
; AVX512-NEXT: vmovdqa64 %zmm9, %zmm6
; AVX512-NEXT: vpermt2q %zmm20, %zmm19, %zmm6
; AVX512-NEXT: vmovdqa64 192(%r8), %zmm8
; AVX512-NEXT: vmovdqa64 192(%r9), %zmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6]
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5
; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm5
; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512-NEXT: vpermt2q %zmm2, %zmm21, %zmm6
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm9, %zmm6
; AVX512-NEXT: vpermt2q %zmm20, %zmm21, %zmm6
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7]
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5
; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm5
; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm6
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6
; AVX512-NEXT: vpermt2q %zmm0, %zmm23, %zmm6
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512-NEXT: vpermt2q %zmm2, %zmm25, %zmm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2
; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm1
; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm3
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm2
; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm4
; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm5
; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512-NEXT: vpermt2q %zmm5, %zmm19, %zmm6
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vmovdqa64 256(%r10), %zmm13
; AVX512-NEXT: vmovdqa64 256(%rax), %zmm2
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm7
; AVX512-NEXT: vpermt2q %zmm2, %zmm19, %zmm7
; AVX512-NEXT: vmovdqa64 256(%r8), %zmm18
; AVX512-NEXT: vmovdqa64 256(%r9), %zmm15
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6]
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6
; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm6
; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7
; AVX512-NEXT: vpermt2q %zmm5, %zmm21, %zmm7
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm7
; AVX512-NEXT: vpermt2q %zmm2, %zmm21, %zmm7
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7]
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6
; AVX512-NEXT: vpermt2q %zmm3, %zmm23, %zmm6
; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7
; AVX512-NEXT: vpermt2q %zmm5, %zmm23, %zmm7
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm18, %zmm7
; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm7
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6]
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm3, %zmm25, %zmm1
; AVX512-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm18, %zmm3
; AVX512-NEXT: vpermt2q %zmm15, %zmm25, %zmm3
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm1
; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm4
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm3
; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6
; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm7
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5
; AVX512-NEXT: vpermt2q %zmm7, %zmm19, %zmm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX512-NEXT: vmovdqa64 320(%r10), %zmm16
; AVX512-NEXT: vmovdqa64 320(%rax), %zmm22
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm12
; AVX512-NEXT: vpermt2q %zmm22, %zmm19, %zmm12
; AVX512-NEXT: vmovdqa64 320(%r8), %zmm29
; AVX512-NEXT: vmovdqa64 320(%r9), %zmm5
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6]
; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512-NEXT: vpermt2q %zmm4, %zmm21, %zmm8
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512-NEXT: vpermt2q %zmm7, %zmm21, %zmm12
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm12
; AVX512-NEXT: vpermt2q %zmm22, %zmm21, %zmm12
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7]
; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512-NEXT: vpermt2q %zmm4, %zmm23, %zmm8
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512-NEXT: vpermt2q %zmm7, %zmm23, %zmm12
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm29, %zmm12
; AVX512-NEXT: vpermt2q %zmm5, %zmm23, %zmm12
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6]
; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm4, %zmm25, %zmm1
; AVX512-NEXT: vpermt2q %zmm7, %zmm25, %zmm6
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm29, %zmm4
; AVX512-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 384(%rdx), %zmm3
; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm1
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm3
; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm1
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 448(%rdx), %zmm3
; AVX512-NEXT: vmovdqa64 448(%rcx), %zmm1
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm3
; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm1
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8]
; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1
; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9]
; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1
; AVX512-NEXT: vpermt2q %zmm17, %zmm12, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1
; AVX512-NEXT: vpermt2q %zmm17, %zmm10, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11]
; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q %zmm17, %zmm4, %zmm26
; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1
; AVX512-NEXT: vpermt2q %zmm24, %zmm11, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1
; AVX512-NEXT: vpermt2q %zmm24, %zmm12, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1
; AVX512-NEXT: vpermt2q %zmm24, %zmm10, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm24, %zmm4, %zmm30
; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm31, %zmm1
; AVX512-NEXT: vpermt2q %zmm27, %zmm11, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm31, %zmm1
; AVX512-NEXT: vpermt2q %zmm27, %zmm12, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm31, %zmm1
; AVX512-NEXT: vpermt2q %zmm27, %zmm10, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm27, %zmm4, %zmm31
; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512-NEXT: vpermt2q %zmm28, %zmm11, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm31
; AVX512-NEXT: vpermt2q %zmm28, %zmm10, %zmm31
; AVX512-NEXT: vpermt2q %zmm28, %zmm4, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm28, %zmm3
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm28, %zmm3
; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm28, %zmm3
; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm28
; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm27
; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm27
; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512-NEXT: vpermt2q %zmm20, %zmm11, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512-NEXT: vpermt2q %zmm20, %zmm12, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512-NEXT: vpermt2q %zmm20, %zmm10, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm20, %zmm4, %zmm9
; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20
; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm20
; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm0
; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm0
; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm0
; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm13
; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512-NEXT: vpermt2q %zmm15, %zmm12, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm18, %zmm28
; AVX512-NEXT: vpermt2q %zmm15, %zmm10, %zmm28
; AVX512-NEXT: vpermt2q %zmm15, %zmm4, %zmm18
; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512-NEXT: vpermt2q %zmm22, %zmm11, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512-NEXT: vpermt2q %zmm22, %zmm12, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512-NEXT: vpermt2q %zmm22, %zmm10, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm22, %zmm4, %zmm16
; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0
; AVX512-NEXT: vpermt2q %zmm5, %zmm11, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0
; AVX512-NEXT: vpermt2q %zmm5, %zmm12, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm29, %zmm22
; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm22
; AVX512-NEXT: vpermt2q %zmm5, %zmm4, %zmm29
; AVX512-NEXT: vmovdqa64 384(%r10), %zmm13
; AVX512-NEXT: vmovdqa64 384(%rax), %zmm0
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1
; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1
; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm14
; AVX512-NEXT: vpermt2q %zmm0, %zmm19, %zmm14
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15
; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm15
; AVX512-NEXT: vmovdqa64 384(%r8), %zmm5
; AVX512-NEXT: vmovdqa64 384(%r9), %zmm2
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8
; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm8
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6]
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6
; AVX512-NEXT: vpermt2q %zmm2, %zmm25, %zmm6
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7]
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm30
; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm30
; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm13
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24
; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm24
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm26
; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm26
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm17
; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm17
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7]
; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm5
; AVX512-NEXT: vmovdqa64 448(%r10), %zmm16
; AVX512-NEXT: vmovdqa64 448(%rax), %zmm1
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermi2q %zmm1, %zmm16, %zmm19
; AVX512-NEXT: vpermi2q %zmm1, %zmm16, %zmm21
; AVX512-NEXT: vmovdqa64 448(%r8), %zmm3
; AVX512-NEXT: vmovdqa64 448(%r9), %zmm0
; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm23
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6]
; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm25
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7]
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm18
; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm18
; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm16
; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm11
; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm12
; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm10
; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6]
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7]
; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm3
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
; AVX512-NEXT: vmovdqa (%rcx), %ymm0
; AVX512-NEXT: vmovdqa (%rdx), %ymm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-NEXT: vmovdqa (%rsi), %ymm4
; AVX512-NEXT: vmovdqa (%rdi), %ymm6
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
; AVX512-NEXT: vmovdqa 64(%rcx), %ymm0
; AVX512-NEXT: vmovdqa 64(%rdx), %ymm1
; AVX512-NEXT: vmovdqa 64(%rsi), %ymm2
; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
; AVX512-NEXT: vmovdqa 128(%rcx), %ymm0
; AVX512-NEXT: vmovdqa 128(%rdx), %ymm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-NEXT: vmovdqa 128(%rsi), %ymm4
; AVX512-NEXT: vmovdqa 128(%rdi), %ymm6
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
; AVX512-NEXT: vmovdqa 192(%rcx), %ymm0
; AVX512-NEXT: vmovdqa 192(%rdx), %ymm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-NEXT: vmovdqa 192(%rsi), %ymm4
; AVX512-NEXT: vmovdqa 192(%rdi), %ymm6
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
; AVX512-NEXT: vmovdqa 256(%rcx), %ymm0
; AVX512-NEXT: vmovdqa 256(%rdx), %ymm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-NEXT: vmovdqa 256(%rsi), %ymm4
; AVX512-NEXT: vmovdqa 256(%rdi), %ymm6
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
; AVX512-NEXT: vmovdqa 320(%rcx), %ymm0
; AVX512-NEXT: vmovdqa 320(%rdx), %ymm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-NEXT: vmovdqa 320(%rsi), %ymm4
; AVX512-NEXT: vmovdqa 320(%rdi), %ymm6
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1}
; AVX512-NEXT: vmovdqa 384(%rcx), %ymm0
; AVX512-NEXT: vmovdqa 384(%rdx), %ymm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-NEXT: vmovdqa 384(%rsi), %ymm4
; AVX512-NEXT: vmovdqa 384(%rdi), %ymm6
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13
; AVX512-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1}
; AVX512-NEXT: vmovdqa 448(%rcx), %ymm0
; AVX512-NEXT: vmovdqa 448(%rdx), %ymm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-NEXT: vmovdqa 448(%rsi), %ymm4
; AVX512-NEXT: vmovdqa 448(%rdi), %ymm5
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3]
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1}
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
; AVX512-NEXT: vmovdqa (%rsi), %xmm0
; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1}
; AVX512-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX512-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
; AVX512-NEXT: vmovdqa 128(%rsi), %xmm1
; AVX512-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1
; AVX512-NEXT: vmovdqa 128(%rdi), %xmm4
; AVX512-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1}
; AVX512-NEXT: vmovdqa 192(%rsi), %xmm1
; AVX512-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1
; AVX512-NEXT: vmovdqa 192(%rdi), %xmm7
; AVX512-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2]
; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1}
; AVX512-NEXT: vmovdqa64 256(%rsi), %xmm16
; AVX512-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16
; AVX512-NEXT: vmovdqa64 256(%rdi), %xmm17
; AVX512-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
; AVX512-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
; AVX512-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1}
; AVX512-NEXT: vmovdqa64 320(%rsi), %xmm16
; AVX512-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16
; AVX512-NEXT: vmovdqa64 320(%rdi), %xmm17
; AVX512-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
; AVX512-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
; AVX512-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1}
; AVX512-NEXT: vmovdqa64 384(%rsi), %xmm17
; AVX512-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17
; AVX512-NEXT: vmovdqa64 384(%rdi), %xmm20
; AVX512-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
; AVX512-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
; AVX512-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1}
; AVX512-NEXT: vmovdqa64 448(%rsi), %xmm20
; AVX512-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20
; AVX512-NEXT: vmovdqa64 448(%rdi), %xmm26
; AVX512-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2]
; AVX512-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1}
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3]
; AVX512-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 %zmm9, 3776(%rax)
; AVX512-NEXT: vmovdqa64 %zmm10, 3712(%rax)
; AVX512-NEXT: vmovdqa64 %zmm13, 3264(%rax)
; AVX512-NEXT: vmovdqa64 %zmm14, 3200(%rax)
; AVX512-NEXT: vmovdqa64 %zmm27, 2752(%rax)
; AVX512-NEXT: vmovdqa64 %zmm22, 2688(%rax)
; AVX512-NEXT: vmovdqa64 %zmm28, 2240(%rax)
; AVX512-NEXT: vmovdqa64 %zmm15, 2176(%rax)
; AVX512-NEXT: vmovdqa64 %zmm19, 1728(%rax)
; AVX512-NEXT: vmovdqa64 %zmm21, 1664(%rax)
; AVX512-NEXT: vmovdqa64 %zmm25, 1216(%rax)
; AVX512-NEXT: vmovdqa64 %zmm23, 1152(%rax)
; AVX512-NEXT: vmovdqa64 %zmm8, 704(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 640(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 192(%rax)
; AVX512-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 128(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 4032(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 3968(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 3904(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 3840(%rax)
; AVX512-NEXT: vmovdqa64 %zmm12, 3648(%rax)
; AVX512-NEXT: vmovdqa64 %zmm11, 3584(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 3520(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 3456(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 3392(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 3328(%rax)
; AVX512-NEXT: vmovdqa64 %zmm17, 3136(%rax)
; AVX512-NEXT: vmovdqa64 %zmm24, 3072(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 3008(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 2944(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 2880(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 2816(%rax)
; AVX512-NEXT: vmovdqa64 %zmm16, 2624(%rax)
; AVX512-NEXT: vmovdqa64 %zmm18, 2560(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 2496(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 2432(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 2368(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 2304(%rax)
; AVX512-NEXT: vmovdqa64 %zmm30, 2112(%rax)
; AVX512-NEXT: vmovdqa64 %zmm29, 2048(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 1984(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 1920(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 1856(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm8, 1792(%rax)
; AVX512-NEXT: vmovdqa64 %zmm1, 1600(%rax)
; AVX512-NEXT: vmovdqa64 %zmm7, 1536(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm1, 1472(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm1, 1408(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm1, 1344(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm1, 1280(%rax)
; AVX512-NEXT: vmovdqa64 %zmm6, 1088(%rax)
; AVX512-NEXT: vmovdqa64 %zmm4, 1024(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm1, 960(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm1, 896(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm1, 832(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm1, 768(%rax)
; AVX512-NEXT: vmovdqa64 %zmm5, 576(%rax)
; AVX512-NEXT: vmovdqa64 %zmm3, 512(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm1, 448(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm1, 384(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm1, 320(%rax)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm1, 256(%rax)
; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512-NEXT: addq $5384, %rsp # imm = 0x1508
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i64_stride8_vf64:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: subq $5384, %rsp # imm = 0x1508
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm14
; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5
; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm10
; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm2
; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm12
; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm3
; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9
; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm15
; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm30
; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm18
; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm11
; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm24
; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm28
; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm22
; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm26
; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm31
; AVX512-FCP-NEXT: vmovdqa64 128(%r10), %zmm16
; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm17
; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm27
; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm13
; AVX512-FCP-NEXT: movb $-64, %r11b
; AVX512-FCP-NEXT: kmovw %r11d, %k1
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm8
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm6
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm8
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13]
; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm8
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm8
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12
; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm8
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm8
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm8
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm8
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm8
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm8
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm5
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm4
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm5
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm4
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm5
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm4
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm5
; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0
; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm2
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0
; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm5
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 192(%r10), %zmm9
; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm20
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm19, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm8
; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm0
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6]
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm6
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm6
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm6
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm6
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm4
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm3
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2
; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4
; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm5
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm6
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 256(%r10), %zmm13
; AVX512-FCP-NEXT: vmovdqa64 256(%rax), %zmm2
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm7
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm7
; AVX512-FCP-NEXT: vmovdqa64 256(%r8), %zmm18
; AVX512-FCP-NEXT: vmovdqa64 256(%r9), %zmm15
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6]
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm7
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm7
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm7
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm7
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm7
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm7
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm3
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm4
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm3
; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6
; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm7
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm5
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 320(%r10), %zmm16
; AVX512-FCP-NEXT: vmovdqa64 320(%rax), %zmm22
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm12
; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm12
; AVX512-FCP-NEXT: vmovdqa64 320(%r8), %zmm29
; AVX512-FCP-NEXT: vmovdqa64 320(%r9), %zmm5
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6]
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm12
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm12
; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm12
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm8
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm12
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm12
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm12
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm25, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm6
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 384(%rdx), %zmm3
; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3
; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 448(%rdx), %zmm3
; AVX512-FCP-NEXT: vmovdqa64 448(%rcx), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3
; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8]
; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9]
; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11]
; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm26
; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm30
; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm31
; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm31
; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm31
; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm28
; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm27
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm9
; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm20
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm13
; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm28
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm28
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm18
; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm16
; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm22
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm22
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm29
; AVX512-FCP-NEXT: vmovdqa64 384(%r10), %zmm13
; AVX512-FCP-NEXT: vmovdqa64 384(%rax), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm14
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm14
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm15
; AVX512-FCP-NEXT: vmovdqa64 384(%r8), %zmm5
; AVX512-FCP-NEXT: vmovdqa64 384(%r9), %zmm2
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm8
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6]
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm6
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm30
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm30
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm13
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm24
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm24
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm26
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm26
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm17
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm17
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7]
; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5
; AVX512-FCP-NEXT: vmovdqa64 448(%r10), %zmm16
; AVX512-FCP-NEXT: vmovdqa64 448(%rax), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm19
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm21
; AVX512-FCP-NEXT: vmovdqa64 448(%r8), %zmm3
; AVX512-FCP-NEXT: vmovdqa64 448(%r9), %zmm0
; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm23
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6]
; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm25
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm18
; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm16
; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm11
; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12
; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm10
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7]
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm0
; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm4
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm0
; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm1
; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm2
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
; AVX512-FCP-NEXT: vmovdqa 128(%rcx), %ymm0
; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %ymm1
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-FCP-NEXT: vmovdqa 128(%rsi), %ymm4
; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
; AVX512-FCP-NEXT: vmovdqa 192(%rcx), %ymm0
; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %ymm1
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-FCP-NEXT: vmovdqa 192(%rsi), %ymm4
; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm6
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
; AVX512-FCP-NEXT: vmovdqa 256(%rcx), %ymm0
; AVX512-FCP-NEXT: vmovdqa 256(%rdx), %ymm1
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-FCP-NEXT: vmovdqa 256(%rsi), %ymm4
; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm6
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
; AVX512-FCP-NEXT: vmovdqa 320(%rcx), %ymm0
; AVX512-FCP-NEXT: vmovdqa 320(%rdx), %ymm1
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-FCP-NEXT: vmovdqa 320(%rsi), %ymm4
; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm6
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27
; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1}
; AVX512-FCP-NEXT: vmovdqa 384(%rcx), %ymm0
; AVX512-FCP-NEXT: vmovdqa 384(%rdx), %ymm1
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-FCP-NEXT: vmovdqa 384(%rsi), %ymm4
; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm6
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1}
; AVX512-FCP-NEXT: vmovdqa 448(%rcx), %ymm0
; AVX512-FCP-NEXT: vmovdqa 448(%rdx), %ymm1
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512-FCP-NEXT: vmovdqa 448(%rsi), %ymm4
; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm5
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1}
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1}
; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
; AVX512-FCP-NEXT: vmovdqa 128(%rsi), %xmm1
; AVX512-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1
; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm4
; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1}
; AVX512-FCP-NEXT: vmovdqa 192(%rsi), %xmm1
; AVX512-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1
; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm7
; AVX512-FCP-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %xmm16
; AVX512-FCP-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16
; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %xmm17
; AVX512-FCP-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %xmm16
; AVX512-FCP-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16
; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %xmm17
; AVX512-FCP-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %xmm17
; AVX512-FCP-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17
; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %xmm20
; AVX512-FCP-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %xmm20
; AVX512-FCP-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20
; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %xmm26
; AVX512-FCP-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1}
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 3776(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 3712(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 3264(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 3200(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 2688(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 2240(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 2176(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 1728(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 1664(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 1152(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 704(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 640(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 192(%rax)
; AVX512-FCP-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 128(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 4032(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 3968(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 3904(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 3840(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 3648(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 3584(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 3520(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 3456(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 3392(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 3328(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 3136(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 3072(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 3008(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 2944(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 2880(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 2816(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 2624(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 2560(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 2496(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 2432(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 2368(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 2304(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 2112(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 2048(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 1984(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 1920(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 1856(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm8, 1792(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 1600(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 1536(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm1, 1472(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm1, 1408(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm1, 1344(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm1, 1280(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 1088(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm1, 960(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm1, 896(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm1, 832(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm1, 768(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 576(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm1, 448(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm1, 384(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm1, 320(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm1, 256(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512-FCP-NEXT: addq $5384, %rsp # imm = 0x1508
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i64_stride8_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: subq $5384, %rsp # imm = 0x1508
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm0
; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm14
; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm1
; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm5
; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm10
; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm2
; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm12
; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm3
; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm9
; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm15
; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm30
; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm18
; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm11
; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm24
; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm28
; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm22
; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm26
; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm31
; AVX512DQ-NEXT: vmovdqa64 128(%r10), %zmm16
; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm17
; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm27
; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm13
; AVX512DQ-NEXT: movb $-64, %r11b
; AVX512DQ-NEXT: kmovw %r11d, %k1
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm19, %zmm8
; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm6
; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm19, %zmm6
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm19, %zmm8
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6]
; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6
; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13]
; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm6
; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm21, %zmm6
; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm8
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm8
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7]
; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6
; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm6
; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm23, %zmm6
; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm23, %zmm8
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm23, %zmm8
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm25, %zmm12
; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm25, %zmm14
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm25, %zmm8
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm19, %zmm6
; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm19, %zmm8
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm19, %zmm8
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6]
; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm21, %zmm6
; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm8
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm21, %zmm8
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7]
; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm23, %zmm6
; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm23, %zmm8
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm8
; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm23, %zmm8
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm25, %zmm7
; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm5
; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm25, %zmm5
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm19, %zmm4
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm5
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm5
; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm19, %zmm5
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm21, %zmm4
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm5
%zmm1, %zmm21, %zmm5 20645; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 20646; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm5 20647; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 20648; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] 20649; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} 20650; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 20651; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20652; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 20653; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm23, %zmm4 20654; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 20655; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 20656; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 20657; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 20658; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20659; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20660; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 20661; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20662; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20663; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] 20664; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 20665; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20666; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm4 20667; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 20668; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm3 20669; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 20670; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm1 20671; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 20672; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm2 20673; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 20674; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] 20675; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 20676; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20677; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 20678; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 20679; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm2 20680; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 20681; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm19, %zmm5 20682; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] 20683; AVX512DQ-NEXT: vmovdqa64 192(%r10), %zmm9 20684; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm20 20685; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm6 20686; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm19, %zmm6 20687; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm8 20688; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm0 20689; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] 20690; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} 20691; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 20692; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20693; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 20694; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 20695; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 20696; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 20697; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 20698; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm6 20699; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm21, %zmm6 20700; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7] 
20701; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} 20702; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 20703; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20704; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 20705; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 20706; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 20707; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm23, %zmm6 20708; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 20709; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20710; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 20711; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 20712; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] 20713; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 20714; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20715; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 20716; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 20717; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] 20718; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 20719; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 20720; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] 20721; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 20722; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20723; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm1 20724; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm3 20725; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 20726; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 20727; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm4 20728; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm5 20729; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 20730; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm19, %zmm6 20731; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] 20732; AVX512DQ-NEXT: vmovdqa64 256(%r10), %zmm13 20733; AVX512DQ-NEXT: vmovdqa64 256(%rax), %zmm2 20734; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm7 20735; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 20736; AVX512DQ-NEXT: vmovdqa64 256(%r8), %zmm18 20737; AVX512DQ-NEXT: vmovdqa64 256(%r9), %zmm15 20738; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] 20739; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} 20740; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 20741; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20742; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 20743; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 20744; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 20745; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 20746; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 20747; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm7 20748; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 20749; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] 20750; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} 20751; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 20752; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20753; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 20754; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 20755; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 20756; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 20757; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 20758; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm7 20759; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm23, 
%zmm7 20760; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] 20761; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 20762; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20763; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 20764; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 20765; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] 20766; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm3 20767; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 20768; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] 20769; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 20770; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20771; AVX512DQ-NEXT: vmovdqa64 320(%rdx), %zmm1 20772; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm4 20773; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 20774; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 20775; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm6 20776; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm7 20777; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 20778; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 20779; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] 20780; AVX512DQ-NEXT: vmovdqa64 320(%r10), %zmm16 20781; AVX512DQ-NEXT: vmovdqa64 320(%rax), %zmm22 20782; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm12 20783; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 20784; AVX512DQ-NEXT: vmovdqa64 320(%r8), %zmm29 20785; AVX512DQ-NEXT: vmovdqa64 320(%r9), %zmm5 20786; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] 20787; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} 20788; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 20789; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20790; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 20791; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 20792; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 20793; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 20794; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] 20795; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm12 20796; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 20797; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] 20798; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} 20799; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 20800; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20801; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 20802; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 20803; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 20804; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm23, %zmm12 20805; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] 20806; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm12 20807; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 20808; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] 20809; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 20810; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20811; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 20812; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 20813; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] 20814; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm4 20815; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 20816; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = 
zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] 20817; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 20818; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20819; AVX512DQ-NEXT: vmovdqa64 384(%rdx), %zmm3 20820; AVX512DQ-NEXT: vmovdqa64 384(%rcx), %zmm1 20821; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 20822; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 20823; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20824; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 20825; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 20826; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20827; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 20828; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 20829; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20830; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 20831; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20832; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm3 20833; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm1 20834; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 20835; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 20836; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20837; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 20838; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 20839; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20840; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 20841; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 20842; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20843; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 20844; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20845; AVX512DQ-NEXT: vmovdqa64 448(%rdx), %zmm3 20846; AVX512DQ-NEXT: vmovdqa64 448(%rcx), %zmm1 20847; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 20848; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 20849; AVX512DQ-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill 20850; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 20851; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 20852; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20853; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 20854; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 20855; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20856; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 20857; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20858; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm3 20859; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %zmm1 20860; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 20861; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 20862; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20863; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 20864; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 20865; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20866; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 20867; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 20868; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20869; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 20870; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20871; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] 20872; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 20873; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 20874; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 20875; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill 20876; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] 20877; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 20878; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 20879; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 20880; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20881; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] 20882; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 20883; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 20884; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 20885; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20886; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] 20887; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 20888; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 20889; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20890; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 20891; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 20892; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20893; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 20894; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 20895; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20896; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 20897; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 20898; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20899; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 20900; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20901; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm1 20902; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 20903; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20904; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm1 20905; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 20906; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20907; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm1 20908; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 20909; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20910; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 20911; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20912; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 20913; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 20914; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 20915; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20916; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 20917; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 20918; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20919; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm31 20920; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 20921; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 20922; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20923; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 20924; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm3 20925; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 20926; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 20927; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20928; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm3 20929; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 20930; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20931; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm3 20932; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, 
%zmm3 20933; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20934; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 20935; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20936; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 20937; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 20938; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 20939; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 20940; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20941; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 20942; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 20943; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20944; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm27 20945; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 20946; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 20947; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20948; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 20949; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 20950; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20951; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 20952; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 20953; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20954; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 20955; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 20956; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20957; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 20958; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20959; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 20960; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 20961; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 20962; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20963; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 20964; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 20965; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20966; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 20967; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 20968; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 20969; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20970; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm0 20971; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 20972; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20973; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm0 20974; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 20975; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20976; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm0 20977; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 20978; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20979; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 20980; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20981; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 20982; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 20983; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20984; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 20985; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 20986; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20987; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm28 20988; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 20989; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 20990; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill 20991; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 20992; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 20993; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20994; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 20995; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 20996; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20997; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 20998; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 20999; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21000; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 21001; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21002; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 21003; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 21004; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21005; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 21006; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 21007; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21008; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm22 21009; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 21010; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 21011; AVX512DQ-NEXT: vmovdqa64 384(%r10), %zmm13 21012; AVX512DQ-NEXT: vmovdqa64 384(%rax), %zmm0 21013; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 21014; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 21015; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21016; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 21017; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 21018; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21019; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm14 21020; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 21021; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 21022; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 21023; AVX512DQ-NEXT: vmovdqa64 384(%r8), %zmm5 21024; AVX512DQ-NEXT: vmovdqa64 384(%r9), %zmm2 21025; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 21026; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 21027; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] 21028; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 21029; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 21030; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] 21031; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm30 21032; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 21033; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 21034; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm24 21035; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 21036; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm26 21037; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 21038; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm17 21039; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 21040; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] 21041; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] 21042; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 21043; AVX512DQ-NEXT: vmovdqa64 448(%r10), %zmm16 21044; AVX512DQ-NEXT: vmovdqa64 448(%rax), %zmm1 21045; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 21046; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 21047; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21048; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 21049; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 21050; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill 21051; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 21052; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 21053; AVX512DQ-NEXT: vmovdqa64 448(%r8), %zmm3 21054; AVX512DQ-NEXT: vmovdqa64 448(%r9), %zmm0 21055; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 21056; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] 21057; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 21058; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] 21059; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm18 21060; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 21061; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 21062; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm11 21063; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 21064; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 21065; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] 21066; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] 21067; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 21068; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21069; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 21070; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 21071; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} 21072; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 21073; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21074; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21075; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 21076; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 21077; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} 21078; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 21079; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21080; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21081; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 21082; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 21083; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 21084; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21085; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21086; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 21087; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 21088; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 21089; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21090; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21091; AVX512DQ-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 21092; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 21093; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} 21094; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 21095; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21096; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21097; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 21098; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 21099; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} 21100; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 21101; AVX512DQ-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21102; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21103; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 21104; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 21105; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 21106; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21107; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21108; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 21109; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 21110; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 21111; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21112; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21113; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21114; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} 21115; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm0 21116; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 21117; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 21118; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 21119; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 21120; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 21121; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 21122; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 21123; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill 21124; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 21125; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 21126; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 21127; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 21128; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 21129; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 21130; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 21131; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21132; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21133; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} 21134; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm0 21135; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm1 21136; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm2 21137; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 21138; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 21139; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] 21140; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] 21141; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 21142; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21143; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 21144; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 21145; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} 21146; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 21147; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] 21148; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 21149; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 21150; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21151; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} 21152; AVX512DQ-NEXT: vmovdqa 128(%rcx), %ymm0 21153; AVX512DQ-NEXT: vmovdqa 
128(%rdx), %ymm1 21154; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 21155; AVX512DQ-NEXT: vmovdqa 128(%rsi), %ymm4 21156; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm6 21157; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 21158; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 21159; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 21160; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 21161; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 21162; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 21163; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 21164; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 21165; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 21166; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 21167; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21168; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} 21169; AVX512DQ-NEXT: vmovdqa 192(%rcx), %ymm0 21170; AVX512DQ-NEXT: vmovdqa 192(%rdx), %ymm1 21171; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 21172; AVX512DQ-NEXT: vmovdqa 192(%rsi), %ymm4 21173; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm6 21174; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 21175; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 21176; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 21177; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 21178; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 21179; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 21180; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 21181; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 21182; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 21183; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 21184; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21185; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} 21186; AVX512DQ-NEXT: vmovdqa 256(%rcx), %ymm0 21187; AVX512DQ-NEXT: vmovdqa 256(%rdx), %ymm1 21188; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 21189; AVX512DQ-NEXT: vmovdqa 256(%rsi), %ymm4 21190; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm6 21191; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 21192; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 21193; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 21194; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 21195; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 21196; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 21197; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 21198; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 21199; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 21200; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 21201; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21202; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} 21203; AVX512DQ-NEXT: vmovdqa 320(%rcx), %ymm0 21204; AVX512DQ-NEXT: vmovdqa 320(%rdx), %ymm1 21205; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 21206; AVX512DQ-NEXT: vmovdqa 320(%rsi), %ymm4 21207; AVX512DQ-NEXT: vmovdqa 320(%rdi), 
%ymm6 21208; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 21209; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 21210; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 21211; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 21212; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} 21213; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 21214; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 21215; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 21216; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 21217; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} 21218; AVX512DQ-NEXT: vmovdqa 384(%rcx), %ymm0 21219; AVX512DQ-NEXT: vmovdqa 384(%rdx), %ymm1 21220; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 21221; AVX512DQ-NEXT: vmovdqa 384(%rsi), %ymm4 21222; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm6 21223; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 21224; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 21225; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 21226; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} 21227; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 21228; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 21229; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 21230; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 21231; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} 21232; AVX512DQ-NEXT: vmovdqa 448(%rcx), %ymm0 21233; AVX512DQ-NEXT: vmovdqa 448(%rdx), %ymm1 21234; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 21235; AVX512DQ-NEXT: vmovdqa 448(%rsi), %ymm4 21236; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm5 21237; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] 21238; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] 21239; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 21240; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 21241; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] 21242; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 21243; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} 21244; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 21245; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 21246; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 21247; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} 21248; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 21249; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 21250; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 21251; AVX512DQ-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 21252; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] 21253; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 21254; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 21255; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 21256; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 21257; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] 21258; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 21259; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21260; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 21261; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} 21262; AVX512DQ-NEXT: 
vmovdqa 64(%rsi), %xmm1 21263; AVX512DQ-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 21264; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm3 21265; AVX512DQ-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 21266; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] 21267; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 21268; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 21269; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 21270; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} 21271; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] 21272; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 21273; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21274; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 21275; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} 21276; AVX512DQ-NEXT: vmovdqa 128(%rsi), %xmm1 21277; AVX512DQ-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 21278; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm4 21279; AVX512DQ-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 21280; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] 21281; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 21282; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 21283; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 21284; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} 21285; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] 21286; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 21287; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21288; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 21289; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} 21290; AVX512DQ-NEXT: vmovdqa 192(%rsi), %xmm1 21291; AVX512DQ-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 21292; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm7 21293; AVX512DQ-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 21294; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] 21295; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 21296; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 21297; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload 21298; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} 21299; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] 21300; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 21301; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 21302; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 21303; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} 21304; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %xmm16 21305; AVX512DQ-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 21306; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %xmm17 21307; AVX512DQ-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 21308; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] 21309; AVX512DQ-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 21310; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload 21311; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 21312; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} 21313; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] 21314; AVX512DQ-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30 21315; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 21316; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 21317; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} 21318; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %xmm16 21319; AVX512DQ-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16 21320; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %xmm17 21321; AVX512DQ-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17 21322; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] 21323; AVX512DQ-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 21324; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 21325; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload 21326; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} 21327; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] 21328; AVX512DQ-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16 21329; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 21330; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1} 21331; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %xmm17 21332; AVX512DQ-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17 21333; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %xmm20 21334; AVX512DQ-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20 21335; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] 21336; AVX512DQ-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 21337; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload 21338; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} 21339; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] 21340; AVX512DQ-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17 21341; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 21342; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} 21343; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %xmm20 21344; AVX512DQ-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20 21345; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %xmm26 21346; AVX512DQ-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26 21347; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] 21348; AVX512DQ-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11 21349; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload 21350; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1} 21351; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] 21352; AVX512DQ-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12 21353; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 21354; AVX512DQ-NEXT: vmovdqa64 %zmm9, 3776(%rax) 21355; AVX512DQ-NEXT: vmovdqa64 %zmm10, 3712(%rax) 21356; AVX512DQ-NEXT: vmovdqa64 %zmm13, 3264(%rax) 21357; AVX512DQ-NEXT: vmovdqa64 %zmm14, 3200(%rax) 21358; AVX512DQ-NEXT: vmovdqa64 %zmm27, 2752(%rax) 21359; AVX512DQ-NEXT: vmovdqa64 %zmm22, 2688(%rax) 21360; AVX512DQ-NEXT: vmovdqa64 %zmm28, 2240(%rax) 21361; AVX512DQ-NEXT: vmovdqa64 %zmm15, 2176(%rax) 21362; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1728(%rax) 21363; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1664(%rax) 21364; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1216(%rax) 21365; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1152(%rax) 21366; AVX512DQ-NEXT: vmovdqa64 %zmm8, 704(%rax) 21367; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21368; AVX512DQ-NEXT: vmovaps %zmm8, 640(%rax) 21369; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21370; AVX512DQ-NEXT: vmovaps %zmm8, 192(%rax) 21371; AVX512DQ-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload 
21372; AVX512DQ-NEXT: vmovaps %zmm8, 128(%rax) 21373; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21374; AVX512DQ-NEXT: vmovaps %zmm8, 4032(%rax) 21375; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21376; AVX512DQ-NEXT: vmovaps %zmm8, 3968(%rax) 21377; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21378; AVX512DQ-NEXT: vmovaps %zmm8, 3904(%rax) 21379; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21380; AVX512DQ-NEXT: vmovaps %zmm8, 3840(%rax) 21381; AVX512DQ-NEXT: vmovdqa64 %zmm12, 3648(%rax) 21382; AVX512DQ-NEXT: vmovdqa64 %zmm11, 3584(%rax) 21383; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21384; AVX512DQ-NEXT: vmovaps %zmm8, 3520(%rax) 21385; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21386; AVX512DQ-NEXT: vmovaps %zmm8, 3456(%rax) 21387; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21388; AVX512DQ-NEXT: vmovaps %zmm8, 3392(%rax) 21389; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21390; AVX512DQ-NEXT: vmovaps %zmm8, 3328(%rax) 21391; AVX512DQ-NEXT: vmovdqa64 %zmm17, 3136(%rax) 21392; AVX512DQ-NEXT: vmovdqa64 %zmm24, 3072(%rax) 21393; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21394; AVX512DQ-NEXT: vmovaps %zmm8, 3008(%rax) 21395; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21396; AVX512DQ-NEXT: vmovaps %zmm8, 2944(%rax) 21397; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21398; AVX512DQ-NEXT: vmovaps %zmm8, 2880(%rax) 21399; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21400; AVX512DQ-NEXT: vmovaps %zmm8, 2816(%rax) 21401; AVX512DQ-NEXT: vmovdqa64 %zmm16, 2624(%rax) 21402; AVX512DQ-NEXT: vmovdqa64 %zmm18, 2560(%rax) 21403; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21404; AVX512DQ-NEXT: vmovaps %zmm8, 2496(%rax) 21405; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21406; AVX512DQ-NEXT: vmovaps %zmm8, 2432(%rax) 21407; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21408; AVX512DQ-NEXT: vmovaps %zmm8, 2368(%rax) 21409; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21410; AVX512DQ-NEXT: vmovaps %zmm8, 2304(%rax) 21411; AVX512DQ-NEXT: vmovdqa64 %zmm30, 2112(%rax) 21412; AVX512DQ-NEXT: vmovdqa64 %zmm29, 2048(%rax) 21413; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21414; AVX512DQ-NEXT: vmovaps %zmm8, 1984(%rax) 21415; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21416; AVX512DQ-NEXT: vmovaps %zmm8, 1920(%rax) 21417; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21418; AVX512DQ-NEXT: vmovaps %zmm8, 1856(%rax) 21419; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 21420; AVX512DQ-NEXT: vmovaps %zmm8, 1792(%rax) 21421; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1600(%rax) 21422; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1536(%rax) 21423; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21424; AVX512DQ-NEXT: vmovaps %zmm1, 1472(%rax) 21425; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21426; AVX512DQ-NEXT: vmovaps %zmm1, 1408(%rax) 21427; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21428; AVX512DQ-NEXT: vmovaps %zmm1, 1344(%rax) 21429; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 
21430; AVX512DQ-NEXT: vmovaps %zmm1, 1280(%rax) 21431; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1088(%rax) 21432; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1024(%rax) 21433; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21434; AVX512DQ-NEXT: vmovaps %zmm1, 960(%rax) 21435; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21436; AVX512DQ-NEXT: vmovaps %zmm1, 896(%rax) 21437; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21438; AVX512DQ-NEXT: vmovaps %zmm1, 832(%rax) 21439; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21440; AVX512DQ-NEXT: vmovaps %zmm1, 768(%rax) 21441; AVX512DQ-NEXT: vmovdqa64 %zmm5, 576(%rax) 21442; AVX512DQ-NEXT: vmovdqa64 %zmm3, 512(%rax) 21443; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21444; AVX512DQ-NEXT: vmovaps %zmm1, 448(%rax) 21445; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21446; AVX512DQ-NEXT: vmovaps %zmm1, 384(%rax) 21447; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21448; AVX512DQ-NEXT: vmovaps %zmm1, 320(%rax) 21449; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 21450; AVX512DQ-NEXT: vmovaps %zmm1, 256(%rax) 21451; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax) 21452; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) 21453; AVX512DQ-NEXT: addq $5384, %rsp # imm = 0x1508 21454; AVX512DQ-NEXT: vzeroupper 21455; AVX512DQ-NEXT: retq 21456; 21457; AVX512DQ-FCP-LABEL: store_i64_stride8_vf64: 21458; AVX512DQ-FCP: # %bb.0: 21459; AVX512DQ-FCP-NEXT: subq $5384, %rsp # imm = 0x1508 21460; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 21461; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 21462; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 21463; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 21464; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm14 21465; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm1 21466; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 21467; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm10 21468; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm2 21469; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 21470; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm12 21471; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm3 21472; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 21473; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 21474; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm30 21475; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 21476; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 21477; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm24 21478; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 21479; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 21480; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm26 21481; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm31 21482; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 21483; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm17 21484; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 21485; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 21486; AVX512DQ-FCP-NEXT: movb $-64, %r11b 21487; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 21488; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] 21489; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 21490; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 21491; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm8 21492; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 21493; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 21494; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] 21495; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 
21496; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm8 21497; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6] 21498; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} 21499; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 21500; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21501; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13] 21502; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 21503; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 21504; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 21505; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 21506; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm8 21507; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 21508; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 21509; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm8 21510; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7] 21511; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} 21512; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 21513; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21514; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] 21515; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 21516; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 21517; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 21518; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 21519; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 21520; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 21521; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 21522; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 21523; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] 21524; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 21525; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21526; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] 21527; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 21528; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 21529; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 21530; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] 21531; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 21532; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 21533; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] 21534; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 21535; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 21536; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 21537; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 21538; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 21539; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 21540; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 21541; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 21542; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 21543; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] 21544; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 21545; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 21546; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm8
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm8
; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm8
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm8
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm8
; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm8
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm5
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm5
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm5
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm2
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm5
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r10), %zmm9
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm20
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm19, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm0
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm6
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm6
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm6
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm6
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm4
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm5
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm6
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r10), %zmm13
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rax), %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r8), %zmm18
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r9), %zmm15
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm7
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm7
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm7
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm7
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm3
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm7
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm5
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r10), %zmm16
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rax), %zmm22
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12
; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm12
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r8), %zmm29
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r9), %zmm5
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm12
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12
; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm12
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm12
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm12
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm12
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm25, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm6
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdx), %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdx), %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rcx), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8]
; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9]
; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11]
; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm26
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm30
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm31
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm31
; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm31
; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm28
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm27
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm9
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm20
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm13
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm28
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm28
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm18
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm16
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm22
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm22
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm29
; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r10), %zmm13
; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rax), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm14
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm14
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm15
; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r8), %zmm5
; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r9), %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm8
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm6
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm30
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm30
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm13
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm24
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm24
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm26
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm26
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm17
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm17
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r10), %zmm16
; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rax), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm19
; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm21
; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r8), %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r9), %zmm0
; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm23
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm25
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm18
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm18
; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm16
; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm11
; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12
; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm10
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6]
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm4
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rcx), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %ymm1
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rsi), %ymm4
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 192(%rcx), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %ymm1
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-FCP-NEXT: vmovdqa 192(%rsi), %ymm4
; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm6
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 256(%rcx), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdx), %ymm1
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-FCP-NEXT: vmovdqa 256(%rsi), %ymm4
; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm6
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 320(%rcx), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdx), %ymm1
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-FCP-NEXT: vmovdqa 320(%rsi), %ymm4
; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm6
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 384(%rcx), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdx), %ymm1
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-FCP-NEXT: vmovdqa 384(%rsi), %ymm4
; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm6
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 448(%rcx), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdx), %ymm1
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-FCP-NEXT: vmovdqa 448(%rsi), %ymm4
; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm5
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1}
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 192(%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm7
; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %xmm16
; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %xmm17
; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %xmm16
; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %xmm17
; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %xmm17
; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17
; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %xmm20
; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %xmm20
; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20
; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %xmm26
; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 3776(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 3712(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 3264(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 3200(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 2688(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 2240(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 2176(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 1728(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 1664(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 1152(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 704(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 640(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 192(%rax)
; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 128(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 4032(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3968(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3904(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3840(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 3648(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 3584(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3520(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3456(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3392(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3328(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 3136(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 3072(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3008(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2944(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2880(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2816(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 2624(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 2560(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2496(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2432(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2368(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2304(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 2112(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 2048(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 1984(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 1920(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 1856(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 1792(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 1600(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 1536(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 1472(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 1408(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 1344(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 1280(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 1088(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 960(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 896(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 832(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 768(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 576(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 448(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 384(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 320(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 256(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-FCP-NEXT: addq $5384, %rsp # imm = 0x1508
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i64_stride8_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: subq $5384, %rsp # imm = 0x1508
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14
; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm1
; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm10
; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm2
; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm12
; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm3
; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9
; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm15
; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm30
; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm18
; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm11
; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm24
; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm28
; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm22
; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm26
; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm31
; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm16
; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm17
; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm27
; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm13
; AVX512BW-NEXT: movb $-64, %r11b
; AVX512BW-NEXT: kmovd %r11d, %k1
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm8
; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm6
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm8
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6]
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1}
; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13]
; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm6
; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm8
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm8
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7]
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1}
; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm6
; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm8
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm8
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm12
; AVX512BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm14
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm8
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm6
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm8
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm27, %zmm19, %zmm8
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6]
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm6
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm8
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm27, %zmm21, %zmm8
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7]
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm6
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm8
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm8
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm7
; AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5
; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm5
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm4
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm5
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5
; AVX512BW-NEXT: vpermt2q %zmm13, %zmm19, %zmm5
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm4
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5
; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm5
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm4
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm5
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm5
; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm3
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm0
; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2
; AVX512BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm2
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0
; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm2
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm5
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm9
; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm20
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm20, %zmm19, %zmm6
; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm8
; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6]
; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm6
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm6
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7]
; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm5
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm6
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm6
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm4
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2
; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm1
; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm3
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm2
; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4
; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm5
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm6
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 =
ymm6[0,1,2,3],ymm2[4,5,6,7] 22626; AVX512BW-NEXT: vmovdqa64 256(%r10), %zmm13 22627; AVX512BW-NEXT: vmovdqa64 256(%rax), %zmm2 22628; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 22629; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 22630; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm18 22631; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm15 22632; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] 22633; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} 22634; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 22635; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22636; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 22637; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 22638; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 22639; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 22640; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 22641; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 22642; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 22643; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] 22644; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} 22645; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 22646; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22647; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 22648; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 22649; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 22650; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 22651; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 22652; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 22653; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm7 22654; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] 22655; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 22656; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22657; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 22658; AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 22659; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] 22660; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 22661; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 22662; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] 22663; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 22664; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22665; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm1 22666; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm4 22667; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 22668; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 22669; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 22670; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm7 22671; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 22672; AVX512BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 22673; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] 22674; AVX512BW-NEXT: vmovdqa64 320(%r10), %zmm16 22675; AVX512BW-NEXT: vmovdqa64 320(%rax), %zmm22 22676; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 22677; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 22678; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm29 22679; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm5 22680; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] 22681; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} 22682; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 22683; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill 22684; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 22685; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 22686; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 22687; AVX512BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 22688; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] 22689; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 22690; AVX512BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 22691; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] 22692; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} 22693; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 22694; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22695; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 22696; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 22697; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 22698; AVX512BW-NEXT: vpermt2q %zmm7, %zmm23, %zmm12 22699; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] 22700; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm12 22701; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 22702; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] 22703; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 22704; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22705; AVX512BW-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 22706; AVX512BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 22707; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] 22708; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 22709; AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 22710; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] 22711; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 22712; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22713; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm3 22714; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm1 22715; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 22716; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 22717; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22718; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 22719; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 22720; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22721; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 22722; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 22723; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22724; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 22725; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22726; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 22727; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm1 22728; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 22729; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 22730; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22731; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 22732; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 22733; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22734; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 22735; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 22736; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22737; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 22738; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22739; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm3 22740; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm1 22741; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 22742; 
AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 22743; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill 22744; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 22745; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 22746; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22747; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 22748; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 22749; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22750; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 22751; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22752; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 22753; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm1 22754; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 22755; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 22756; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22757; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 22758; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 22759; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22760; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 22761; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 22762; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22763; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 22764; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22765; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] 22766; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 22767; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 22768; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 22769; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22770; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] 22771; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 22772; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 22773; AVX512BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 22774; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22775; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] 22776; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 22777; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 22778; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 22779; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22780; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] 22781; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 22782; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 22783; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22784; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 22785; AVX512BW-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 22786; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22787; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 22788; AVX512BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 22789; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22790; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 22791; AVX512BW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 22792; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22793; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 22794; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22795; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 22796; AVX512BW-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 22797; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22798; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 22799; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 22800; 
AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22801; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 22802; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 22803; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22804; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 22805; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22806; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 22807; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 22808; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 22809; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22810; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 22811; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 22812; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22813; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 22814; AVX512BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 22815; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 22816; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22817; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 22818; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 22819; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 22820; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 22821; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22822; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 22823; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 22824; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22825; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 22826; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 22827; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22828; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 22829; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22830; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 22831; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 22832; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 22833; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 22834; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22835; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 22836; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 22837; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22838; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm27 22839; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 22840; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 22841; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22842; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 22843; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 22844; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22845; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 22846; AVX512BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 22847; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22848; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 22849; AVX512BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 22850; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22851; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 22852; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22853; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 22854; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 22855; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 22856; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
22857; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 22858; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 22859; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22860; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 22861; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 22862; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 22863; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22864; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 22865; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 22866; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22867; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 22868; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 22869; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22870; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 22871; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 22872; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22873; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 22874; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22875; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 22876; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 22877; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22878; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 22879; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 22880; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22881; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm28 22882; AVX512BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 22883; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 22884; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22885; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 22886; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 22887; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22888; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 22889; AVX512BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 22890; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22891; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 22892; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 22893; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22894; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 22895; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22896; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 22897; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 22898; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22899; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 22900; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 22901; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22902; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm22 22903; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 22904; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 22905; AVX512BW-NEXT: vmovdqa64 384(%r10), %zmm13 22906; AVX512BW-NEXT: vmovdqa64 384(%rax), %zmm0 22907; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 22908; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 22909; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22910; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 22911; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 22912; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22913; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 22914; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 22915; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 22916; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 22917; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm5 
22918; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm2 22919; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 22920; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 22921; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] 22922; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 22923; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 22924; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] 22925; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm30 22926; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 22927; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 22928; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 22929; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 22930; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 22931; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 22932; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 22933; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 22934; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] 22935; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] 22936; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 22937; AVX512BW-NEXT: vmovdqa64 448(%r10), %zmm16 22938; AVX512BW-NEXT: vmovdqa64 448(%rax), %zmm1 22939; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 22940; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 22941; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22942; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 22943; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 22944; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22945; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 22946; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 22947; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm3 22948; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm0 22949; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 22950; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] 22951; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 22952; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] 22953; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 22954; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 22955; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 22956; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm11 22957; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 22958; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 22959; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] 22960; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] 22961; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 22962; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 22963; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 22964; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 22965; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} 22966; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 22967; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22968; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 22969; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 22970; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 22971; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} 22972; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 22973; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22974; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 22975; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 22976; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 22977; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 22978; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22979; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 22980; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 22981; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 22982; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 22983; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22984; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 22985; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 22986; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 22987; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} 22988; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 22989; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22990; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 22991; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 22992; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 22993; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} 22994; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 22995; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 22996; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 22997; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 22998; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 22999; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 23000; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23001; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23002; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 23003; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 23004; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 23005; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23006; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23007; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23008; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} 23009; AVX512BW-NEXT: vmovdqa (%rcx), %ymm0 23010; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 23011; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 23012; AVX512BW-NEXT: vmovdqa (%rsi), %ymm4 23013; AVX512BW-NEXT: vmovdqa (%rdi), %ymm6 23014; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 23015; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 23016; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 23017; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill 23018; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 23019; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 23020; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 23021; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 23022; AVX512BW-NEXT: 
vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 23023; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 23024; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 23025; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23026; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23027; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} 23028; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm0 23029; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm1 23030; AVX512BW-NEXT: vmovdqa 64(%rsi), %ymm2 23031; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm4 23032; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 23033; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] 23034; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] 23035; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 23036; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23037; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 23038; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 23039; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} 23040; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 23041; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] 23042; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 23043; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 23044; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23045; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} 23046; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm0 23047; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm1 23048; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 23049; AVX512BW-NEXT: vmovdqa 128(%rsi), %ymm4 23050; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6 23051; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 23052; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 23053; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 23054; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 23055; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 23056; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 23057; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 23058; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 23059; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 23060; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 23061; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23062; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} 23063; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm0 23064; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm1 23065; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 23066; AVX512BW-NEXT: vmovdqa 192(%rsi), %ymm4 23067; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm6 23068; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 23069; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 23070; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 23071; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 23072; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 23073; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 23074; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 23075; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = 
ymm6[1],ymm4[1],ymm6[3],ymm4[3] 23076; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 23077; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 23078; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23079; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} 23080; AVX512BW-NEXT: vmovdqa 256(%rcx), %ymm0 23081; AVX512BW-NEXT: vmovdqa 256(%rdx), %ymm1 23082; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 23083; AVX512BW-NEXT: vmovdqa 256(%rsi), %ymm4 23084; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm6 23085; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 23086; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 23087; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 23088; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 23089; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 23090; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 23091; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 23092; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 23093; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 23094; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 23095; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23096; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} 23097; AVX512BW-NEXT: vmovdqa 320(%rcx), %ymm0 23098; AVX512BW-NEXT: vmovdqa 320(%rdx), %ymm1 23099; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 23100; AVX512BW-NEXT: vmovdqa 320(%rsi), %ymm4 23101; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm6 23102; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 23103; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 23104; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 23105; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 23106; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} 23107; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 23108; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 23109; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 23110; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 23111; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} 23112; AVX512BW-NEXT: vmovdqa 384(%rcx), %ymm0 23113; AVX512BW-NEXT: vmovdqa 384(%rdx), %ymm1 23114; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 23115; AVX512BW-NEXT: vmovdqa 384(%rsi), %ymm4 23116; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm6 23117; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 23118; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 23119; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 23120; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} 23121; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 23122; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 23123; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 23124; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 23125; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} 23126; AVX512BW-NEXT: vmovdqa 448(%rcx), %ymm0 23127; AVX512BW-NEXT: vmovdqa 448(%rdx), %ymm1 23128; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 23129; AVX512BW-NEXT: vmovdqa 448(%rsi), %ymm4 23130; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm5 23131; AVX512BW-NEXT: 
vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] 23132; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] 23133; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 23134; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 23135; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] 23136; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 23137; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} 23138; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 23139; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23140; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 23141; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} 23142; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 23143; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 23144; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 23145; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 23146; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] 23147; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 23148; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 23149; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 23150; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 23151; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] 23152; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 23153; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23154; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 23155; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} 23156; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm1 23157; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 23158; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm3 23159; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 23160; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] 23161; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 23162; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 23163; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 23164; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} 23165; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] 23166; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 23167; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23168; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 23169; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} 23170; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm1 23171; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 23172; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm4 23173; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 23174; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] 23175; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 23176; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 23177; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 23178; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} 23179; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] 23180; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 23181; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23182; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 23183; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} 23184; AVX512BW-NEXT: vmovdqa 192(%rsi), 
%xmm1 23185; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 23186; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm7 23187; AVX512BW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 23188; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] 23189; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 23190; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 23191; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload 23192; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} 23193; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] 23194; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 23195; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 23196; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 23197; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} 23198; AVX512BW-NEXT: vmovdqa64 256(%rsi), %xmm16 23199; AVX512BW-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 23200; AVX512BW-NEXT: vmovdqa64 256(%rdi), %xmm17 23201; AVX512BW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 23202; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] 23203; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 23204; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload 23205; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 23206; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} 23207; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] 23208; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30 23209; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 23210; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 23211; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} 23212; AVX512BW-NEXT: vmovdqa64 320(%rsi), %xmm16 23213; AVX512BW-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16 23214; AVX512BW-NEXT: vmovdqa64 320(%rdi), %xmm17 23215; AVX512BW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17 23216; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] 23217; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 23218; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 23219; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload 23220; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} 23221; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] 23222; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16 23223; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 23224; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1} 23225; AVX512BW-NEXT: vmovdqa64 384(%rsi), %xmm17 23226; AVX512BW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17 23227; AVX512BW-NEXT: vmovdqa64 384(%rdi), %xmm20 23228; AVX512BW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20 23229; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] 23230; AVX512BW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 23231; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload 23232; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} 23233; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] 23234; AVX512BW-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17 23235; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 23236; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} 23237; 
AVX512BW-NEXT: vmovdqa64 448(%rsi), %xmm20 23238; AVX512BW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20 23239; AVX512BW-NEXT: vmovdqa64 448(%rdi), %xmm26 23240; AVX512BW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26 23241; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] 23242; AVX512BW-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11 23243; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload 23244; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1} 23245; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] 23246; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12 23247; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 23248; AVX512BW-NEXT: vmovdqa64 %zmm9, 3776(%rax) 23249; AVX512BW-NEXT: vmovdqa64 %zmm10, 3712(%rax) 23250; AVX512BW-NEXT: vmovdqa64 %zmm13, 3264(%rax) 23251; AVX512BW-NEXT: vmovdqa64 %zmm14, 3200(%rax) 23252; AVX512BW-NEXT: vmovdqa64 %zmm27, 2752(%rax) 23253; AVX512BW-NEXT: vmovdqa64 %zmm22, 2688(%rax) 23254; AVX512BW-NEXT: vmovdqa64 %zmm28, 2240(%rax) 23255; AVX512BW-NEXT: vmovdqa64 %zmm15, 2176(%rax) 23256; AVX512BW-NEXT: vmovdqa64 %zmm19, 1728(%rax) 23257; AVX512BW-NEXT: vmovdqa64 %zmm21, 1664(%rax) 23258; AVX512BW-NEXT: vmovdqa64 %zmm25, 1216(%rax) 23259; AVX512BW-NEXT: vmovdqa64 %zmm23, 1152(%rax) 23260; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%rax) 23261; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23262; AVX512BW-NEXT: vmovaps %zmm8, 640(%rax) 23263; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23264; AVX512BW-NEXT: vmovaps %zmm8, 192(%rax) 23265; AVX512BW-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload 23266; AVX512BW-NEXT: vmovaps %zmm8, 128(%rax) 23267; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23268; AVX512BW-NEXT: vmovaps %zmm8, 4032(%rax) 23269; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23270; AVX512BW-NEXT: vmovaps %zmm8, 3968(%rax) 23271; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23272; AVX512BW-NEXT: vmovaps %zmm8, 3904(%rax) 23273; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23274; AVX512BW-NEXT: vmovaps %zmm8, 3840(%rax) 23275; AVX512BW-NEXT: vmovdqa64 %zmm12, 3648(%rax) 23276; AVX512BW-NEXT: vmovdqa64 %zmm11, 3584(%rax) 23277; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23278; AVX512BW-NEXT: vmovaps %zmm8, 3520(%rax) 23279; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23280; AVX512BW-NEXT: vmovaps %zmm8, 3456(%rax) 23281; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23282; AVX512BW-NEXT: vmovaps %zmm8, 3392(%rax) 23283; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23284; AVX512BW-NEXT: vmovaps %zmm8, 3328(%rax) 23285; AVX512BW-NEXT: vmovdqa64 %zmm17, 3136(%rax) 23286; AVX512BW-NEXT: vmovdqa64 %zmm24, 3072(%rax) 23287; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23288; AVX512BW-NEXT: vmovaps %zmm8, 3008(%rax) 23289; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23290; AVX512BW-NEXT: vmovaps %zmm8, 2944(%rax) 23291; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23292; AVX512BW-NEXT: vmovaps %zmm8, 2880(%rax) 23293; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23294; AVX512BW-NEXT: vmovaps %zmm8, 2816(%rax) 23295; AVX512BW-NEXT: vmovdqa64 %zmm16, 2624(%rax) 23296; AVX512BW-NEXT: vmovdqa64 %zmm18, 
2560(%rax) 23297; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23298; AVX512BW-NEXT: vmovaps %zmm8, 2496(%rax) 23299; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23300; AVX512BW-NEXT: vmovaps %zmm8, 2432(%rax) 23301; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23302; AVX512BW-NEXT: vmovaps %zmm8, 2368(%rax) 23303; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23304; AVX512BW-NEXT: vmovaps %zmm8, 2304(%rax) 23305; AVX512BW-NEXT: vmovdqa64 %zmm30, 2112(%rax) 23306; AVX512BW-NEXT: vmovdqa64 %zmm29, 2048(%rax) 23307; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23308; AVX512BW-NEXT: vmovaps %zmm8, 1984(%rax) 23309; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23310; AVX512BW-NEXT: vmovaps %zmm8, 1920(%rax) 23311; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23312; AVX512BW-NEXT: vmovaps %zmm8, 1856(%rax) 23313; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23314; AVX512BW-NEXT: vmovaps %zmm8, 1792(%rax) 23315; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%rax) 23316; AVX512BW-NEXT: vmovdqa64 %zmm7, 1536(%rax) 23317; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23318; AVX512BW-NEXT: vmovaps %zmm1, 1472(%rax) 23319; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23320; AVX512BW-NEXT: vmovaps %zmm1, 1408(%rax) 23321; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23322; AVX512BW-NEXT: vmovaps %zmm1, 1344(%rax) 23323; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23324; AVX512BW-NEXT: vmovaps %zmm1, 1280(%rax) 23325; AVX512BW-NEXT: vmovdqa64 %zmm6, 1088(%rax) 23326; AVX512BW-NEXT: vmovdqa64 %zmm4, 1024(%rax) 23327; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23328; AVX512BW-NEXT: vmovaps %zmm1, 960(%rax) 23329; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23330; AVX512BW-NEXT: vmovaps %zmm1, 896(%rax) 23331; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23332; AVX512BW-NEXT: vmovaps %zmm1, 832(%rax) 23333; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23334; AVX512BW-NEXT: vmovaps %zmm1, 768(%rax) 23335; AVX512BW-NEXT: vmovdqa64 %zmm5, 576(%rax) 23336; AVX512BW-NEXT: vmovdqa64 %zmm3, 512(%rax) 23337; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23338; AVX512BW-NEXT: vmovaps %zmm1, 448(%rax) 23339; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23340; AVX512BW-NEXT: vmovaps %zmm1, 384(%rax) 23341; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23342; AVX512BW-NEXT: vmovaps %zmm1, 320(%rax) 23343; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23344; AVX512BW-NEXT: vmovaps %zmm1, 256(%rax) 23345; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) 23346; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) 23347; AVX512BW-NEXT: addq $5384, %rsp # imm = 0x1508 23348; AVX512BW-NEXT: vzeroupper 23349; AVX512BW-NEXT: retq 23350; 23351; AVX512BW-FCP-LABEL: store_i64_stride8_vf64: 23352; AVX512BW-FCP: # %bb.0: 23353; AVX512BW-FCP-NEXT: subq $5384, %rsp # imm = 0x1508 23354; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 23355; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 23356; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 23357; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 23358; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), 
%zmm14 23359; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm1 23360; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 23361; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm10 23362; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm2 23363; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 23364; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm12 23365; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm3 23366; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 23367; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 23368; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm30 23369; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 23370; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 23371; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm24 23372; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 23373; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 23374; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm26 23375; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm31 23376; AVX512BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 23377; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm17 23378; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 23379; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 23380; AVX512BW-FCP-NEXT: movb $-64, %r11b 23381; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 23382; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] 23383; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 23384; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 23385; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm8 23386; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 23387; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 23388; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] 23389; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 23390; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm8 23391; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6] 23392; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} 23393; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 23394; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23395; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13] 23396; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 23397; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 23398; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 23399; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 23400; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm8 23401; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 23402; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 23403; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm8 23404; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7] 23405; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} 23406; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 23407; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23408; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] 23409; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 23410; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 23411; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 23412; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 23413; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 23414; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 23415; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 23416; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 23417; AVX512BW-FCP-NEXT: vpunpcklqdq 
{{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] 23418; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 23419; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23420; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] 23421; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 23422; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 23423; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 23424; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] 23425; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 23426; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 23427; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] 23428; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 23429; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23430; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 23431; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 23432; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 23433; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 23434; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 23435; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 23436; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 23437; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] 23438; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 23439; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 23440; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23441; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 23442; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm6 23443; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 23444; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm8 23445; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 23446; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 23447; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm8 23448; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7] 23449; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 23450; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 23451; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23452; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 23453; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 23454; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 23455; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 23456; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 23457; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23458; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 23459; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm8 23460; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] 23461; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 23462; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23463; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 23464; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 23465; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] 23466; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 23467; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 23468; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = 
zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] 23469; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 23470; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23471; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 23472; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 23473; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 23474; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 23475; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 23476; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 23477; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 23478; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] 23479; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} 23480; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 23481; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23482; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 23483; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 23484; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 23485; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 23486; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 23487; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 23488; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 23489; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] 23490; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} 23491; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 23492; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23493; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 23494; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm4 23495; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 23496; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 23497; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 23498; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 23499; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23500; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23501; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 23502; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23503; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23504; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] 23505; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 23506; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23507; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 23508; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 23509; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3 23510; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 23511; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 23512; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 23513; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 23514; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 23515; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] 23516; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 23517; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23518; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 23519; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 23520; AVX512BW-FCP-NEXT: 
vmovdqa64 192(%rsi), %zmm2 23521; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 23522; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm5 23523; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] 23524; AVX512BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm9 23525; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm20 23526; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 23527; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm19, %zmm6 23528; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm8 23529; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 23530; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] 23531; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} 23532; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 23533; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23534; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 23535; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 23536; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 23537; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 23538; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 23539; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 23540; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm6 23541; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7] 23542; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} 23543; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 23544; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23545; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 23546; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 23547; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 23548; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm6 23549; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 23550; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23551; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 23552; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 23553; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] 23554; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 23555; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23556; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 23557; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 23558; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] 23559; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 23560; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 23561; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] 23562; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 23563; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23564; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm1 23565; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm3 23566; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 23567; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 23568; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 23569; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm5 23570; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 23571; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm6 23572; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] 23573; AVX512BW-FCP-NEXT: vmovdqa64 256(%r10), %zmm13 23574; AVX512BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm2 23575; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 23576; 
AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 23577; AVX512BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm18 23578; AVX512BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm15 23579; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] 23580; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} 23581; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 23582; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23583; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 23584; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 23585; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 23586; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 23587; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 23588; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 23589; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 23590; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] 23591; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} 23592; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 23593; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23594; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 23595; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 23596; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 23597; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 23598; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 23599; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 23600; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm7 23601; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] 23602; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 23603; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23604; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 23605; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 23606; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] 23607; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 23608; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 23609; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] 23610; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 23611; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23612; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm1 23613; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm4 23614; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 23615; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 23616; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 23617; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm7 23618; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 23619; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 23620; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] 23621; AVX512BW-FCP-NEXT: vmovdqa64 320(%r10), %zmm16 23622; AVX512BW-FCP-NEXT: vmovdqa64 320(%rax), %zmm22 23623; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 23624; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 23625; AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm29 23626; AVX512BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm5 23627; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] 23628; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} 23629; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 23630; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23631; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 23632; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 23633; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 23634; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 23635; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] 23636; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 23637; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 23638; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] 23639; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} 23640; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 23641; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23642; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 23643; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 23644; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 23645; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm12 23646; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] 23647; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm12 23648; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 23649; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] 23650; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 23651; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23652; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 23653; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 23654; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] 23655; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 23656; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 23657; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] 23658; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 23659; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23660; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm3 23661; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm1 23662; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 23663; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 23664; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23665; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 23666; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 23667; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23668; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 23669; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 23670; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23671; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 23672; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23673; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 23674; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm1 23675; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 23676; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 23677; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23678; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 23679; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 23680; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23681; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 23682; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 23683; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23684; AVX512BW-FCP-NEXT: vpermt2q 
%zmm1, %zmm25, %zmm3 23685; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23686; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm3 23687; AVX512BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm1 23688; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 23689; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 23690; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill 23691; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 23692; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 23693; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23694; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 23695; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 23696; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23697; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 23698; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23699; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 23700; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm1 23701; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 23702; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 23703; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23704; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 23705; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 23706; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23707; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 23708; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 23709; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23710; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 23711; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23712; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] 23713; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 23714; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 23715; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 23716; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23717; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] 23718; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 23719; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 23720; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 23721; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23722; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] 23723; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 23724; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 23725; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 23726; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23727; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] 23728; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 23729; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 23730; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23731; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 23732; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 23733; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23734; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 23735; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 23736; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23737; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 23738; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 23739; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23740; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 23741; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23742; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 23743; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 23744; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23745; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 23746; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 23747; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23748; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 23749; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 23750; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23751; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 23752; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23753; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 23754; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 23755; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 23756; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23757; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 23758; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 23759; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23760; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 23761; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 23762; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 23763; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23764; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 23765; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 23766; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23767; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 23768; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23769; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 23770; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 23771; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23772; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 23773; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 23774; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23775; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 23776; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23777; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 23778; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 23779; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23780; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 23781; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23782; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 23783; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 23784; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23785; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 23786; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 23787; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 23788; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23789; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 23790; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 23791; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23792; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 23793; 
AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 23794; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23795; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 23796; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 23797; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23798; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 23799; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23800; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 23801; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 23802; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 23803; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23804; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 23805; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 23806; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23807; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 23808; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 23809; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 23810; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23811; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 23812; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 23813; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23814; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 23815; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 23816; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23817; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 23818; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 23819; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23820; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 23821; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23822; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 23823; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 23824; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23825; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 23826; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 23827; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23828; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm28 23829; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 23830; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 23831; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23832; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 23833; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 23834; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23835; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 23836; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 23837; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23838; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 23839; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 23840; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23841; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 23842; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23843; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 23844; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 23845; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23846; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 23847; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 23848; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23849; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 23850; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 23851; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 23852; AVX512BW-FCP-NEXT: vmovdqa64 384(%r10), %zmm13 23853; AVX512BW-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 23854; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 23855; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 23856; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23857; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 23858; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 23859; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23860; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 23861; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 23862; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 23863; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 23864; AVX512BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm5 23865; AVX512BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm2 23866; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 23867; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 23868; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] 23869; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 23870; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 23871; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] 23872; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 23873; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 23874; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 23875; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 23876; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 23877; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 23878; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 23879; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 23880; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 23881; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] 23882; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] 23883; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 23884; AVX512BW-FCP-NEXT: vmovdqa64 448(%r10), %zmm16 23885; AVX512BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm1 23886; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 23887; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 23888; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23889; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 23890; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 23891; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23892; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 23893; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 23894; AVX512BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 23895; AVX512BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 23896; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 23897; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] 23898; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 23899; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] 23900; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 23901; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 23902; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 23903; AVX512BW-FCP-NEXT: vpermi2q %zmm0, 
%zmm3, %zmm11 23904; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 23905; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 23906; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] 23907; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] 23908; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 23909; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23910; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 23911; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 23912; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} 23913; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 23914; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23915; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23916; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 23917; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 23918; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} 23919; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 23920; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23921; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23922; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 23923; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 23924; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 23925; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23926; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23927; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 23928; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 23929; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 23930; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23931; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23932; AVX512BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 23933; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 23934; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} 23935; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 23936; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23937; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23938; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 23939; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 23940; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} 23941; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 23942; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23943; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23944; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 23945; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 23946; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 23947; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23948; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23949; AVX512BW-FCP-NEXT: vpblendd $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 23950; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 23951; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 23952; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23953; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23954; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 23955; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} 23956; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm0 23957; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 23958; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 23959; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm4 23960; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 23961; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 23962; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 23963; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 23964; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill 23965; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 23966; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 23967; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 23968; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 23969; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 23970; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 23971; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 23972; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23973; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23974; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} 23975; AVX512BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 23976; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %ymm1 23977; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %ymm2 23978; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 23979; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 23980; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] 23981; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] 23982; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 23983; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 23984; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 23985; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 23986; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} 23987; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 23988; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] 23989; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 23990; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 23991; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 23992; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} 23993; AVX512BW-FCP-NEXT: vmovdqa 128(%rcx), %ymm0 23994; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %ymm1 23995; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 23996; AVX512BW-FCP-NEXT: vmovdqa 128(%rsi), %ymm4 23997; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 23998; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 23999; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 24000; 
AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 24001; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 24002; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 24003; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 24004; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 24005; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 24006; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 24007; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 24008; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 24009; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} 24010; AVX512BW-FCP-NEXT: vmovdqa 192(%rcx), %ymm0 24011; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %ymm1 24012; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 24013; AVX512BW-FCP-NEXT: vmovdqa 192(%rsi), %ymm4 24014; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 24015; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 24016; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 24017; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 24018; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 24019; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 24020; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 24021; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 24022; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 24023; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 24024; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 24025; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 24026; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} 24027; AVX512BW-FCP-NEXT: vmovdqa 256(%rcx), %ymm0 24028; AVX512BW-FCP-NEXT: vmovdqa 256(%rdx), %ymm1 24029; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 24030; AVX512BW-FCP-NEXT: vmovdqa 256(%rsi), %ymm4 24031; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 24032; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 24033; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 24034; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 24035; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 24036; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 24037; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 24038; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 24039; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 24040; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 24041; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 24042; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 24043; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} 24044; AVX512BW-FCP-NEXT: vmovdqa 320(%rcx), %ymm0 24045; AVX512BW-FCP-NEXT: vmovdqa 320(%rdx), %ymm1 24046; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 24047; AVX512BW-FCP-NEXT: vmovdqa 320(%rsi), %ymm4 24048; AVX512BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm6 24049; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 24050; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = 
ymm7[2,3],ymm2[2,3] 24051; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 24052; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 24053; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} 24054; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 24055; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 24056; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 24057; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 24058; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} 24059; AVX512BW-FCP-NEXT: vmovdqa 384(%rcx), %ymm0 24060; AVX512BW-FCP-NEXT: vmovdqa 384(%rdx), %ymm1 24061; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 24062; AVX512BW-FCP-NEXT: vmovdqa 384(%rsi), %ymm4 24063; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 24064; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 24065; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 24066; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 24067; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} 24068; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 24069; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 24070; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 24071; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 24072; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} 24073; AVX512BW-FCP-NEXT: vmovdqa 448(%rcx), %ymm0 24074; AVX512BW-FCP-NEXT: vmovdqa 448(%rdx), %ymm1 24075; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 24076; AVX512BW-FCP-NEXT: vmovdqa 448(%rsi), %ymm4 24077; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm5 24078; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] 24079; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] 24080; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 24081; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 24082; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] 24083; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 24084; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} 24085; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 24086; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 24087; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 24088; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} 24089; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 24090; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 24091; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 24092; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 24093; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] 24094; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 24095; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 24096; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 24097; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 24098; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] 24099; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 24100; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 24101; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 24102; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} 24103; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm1 24104; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 24105; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 24106; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 24107; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] 24108; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 24109; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 24110; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 24111; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} 24112; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] 24113; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 24114; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 24115; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 24116; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} 24117; AVX512BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm1 24118; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 24119; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm4 24120; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 24121; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] 24122; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 24123; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 24124; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 24125; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} 24126; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] 24127; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 24128; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 24129; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 24130; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} 24131; AVX512BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm1 24132; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 24133; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 24134; AVX512BW-FCP-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 24135; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] 24136; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 24137; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 24138; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload 24139; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} 24140; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] 24141; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 24142; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 24143; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 24144; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} 24145; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %xmm16 24146; AVX512BW-FCP-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 24147; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %xmm17 24148; AVX512BW-FCP-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 24149; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] 24150; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 24151; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload 24152; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 24153; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} 24154; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] 24155; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30 24156; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 24157; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 24158; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} 24159; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %xmm16 24160; AVX512BW-FCP-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16 24161; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %xmm17 24162; AVX512BW-FCP-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17 24163; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] 24164; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 24165; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 24166; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload 24167; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} 24168; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] 24169; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16 24170; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 24171; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1} 24172; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %xmm17 24173; AVX512BW-FCP-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17 24174; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %xmm20 24175; AVX512BW-FCP-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20 24176; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] 24177; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 24178; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload 24179; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} 24180; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] 24181; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17 24182; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 24183; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} 24184; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %xmm20 24185; AVX512BW-FCP-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20 24186; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %xmm26 24187; AVX512BW-FCP-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26 24188; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] 24189; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11 24190; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload 24191; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1} 24192; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] 24193; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12 24194; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 24195; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 3776(%rax) 24196; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 3712(%rax) 24197; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 3264(%rax) 24198; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 3200(%rax) 24199; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax) 24200; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 2688(%rax) 24201; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 2240(%rax) 24202; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 2176(%rax) 24203; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 1728(%rax) 24204; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm21, 1664(%rax) 24205; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax) 24206; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 1152(%rax) 24207; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 704(%rax) 24208; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24209; AVX512BW-FCP-NEXT: vmovaps %zmm8, 640(%rax) 24210; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24211; AVX512BW-FCP-NEXT: vmovaps %zmm8, 192(%rax) 24212; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload 24213; AVX512BW-FCP-NEXT: vmovaps %zmm8, 128(%rax) 24214; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24215; AVX512BW-FCP-NEXT: vmovaps %zmm8, 4032(%rax) 24216; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24217; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3968(%rax) 24218; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24219; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3904(%rax) 24220; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24221; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3840(%rax) 24222; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 3648(%rax) 24223; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 3584(%rax) 24224; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24225; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3520(%rax) 24226; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24227; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3456(%rax) 24228; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24229; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3392(%rax) 24230; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24231; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3328(%rax) 24232; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 3136(%rax) 24233; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 3072(%rax) 24234; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24235; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3008(%rax) 24236; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24237; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2944(%rax) 24238; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24239; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2880(%rax) 24240; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24241; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2816(%rax) 24242; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 2624(%rax) 24243; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 2560(%rax) 24244; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24245; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2496(%rax) 24246; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24247; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2432(%rax) 24248; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24249; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2368(%rax) 24250; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24251; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2304(%rax) 24252; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 2112(%rax) 24253; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 2048(%rax) 24254; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24255; AVX512BW-FCP-NEXT: vmovaps %zmm8, 1984(%rax) 24256; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24257; AVX512BW-FCP-NEXT: vmovaps %zmm8, 1920(%rax) 24258; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 24259; AVX512BW-FCP-NEXT: 
vmovaps %zmm8, 1856(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm8, 1792(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 1600(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 1536(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, 1472(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, 1408(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, 1344(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, 1280(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 1088(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, 960(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, 896(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, 832(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, 768(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 576(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, 448(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, 384(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, 320(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, 256(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512BW-FCP-NEXT: addq $5384, %rsp # imm = 0x1508
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i64_stride8_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: subq $5384, %rsp # imm = 0x1508
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm14
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm12
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm9
; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm15
; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm30
; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm18
; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm11
; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm24
; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm28
; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm22
; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm26
; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm31
; AVX512DQ-BW-NEXT: vmovdqa64 128(%r10), %zmm16
; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm17
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm27
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm13
; AVX512DQ-BW-NEXT: movb $-64, %r11b
; AVX512DQ-BW-NEXT: kmovd %r11d, %k1
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm8
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm6
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm8
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13]
; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm8
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm8
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm8
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm8
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm14
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm8
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm8
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm19, %zmm8
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm8
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm21, %zmm8
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm8
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm8
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm7
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm5
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm5
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm19, %zmm5
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm5
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm5
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm5
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm1
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm2
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm5
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 192(%r10), %zmm9
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm20
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm19, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm8
; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm0
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm6
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm6
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm6
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm6
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm4
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm6
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 256(%r10), %zmm13
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rax), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm7
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 256(%r8), %zmm18
; AVX512DQ-BW-NEXT: vmovdqa64 256(%r9), %zmm15
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm7
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm7
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm7
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm7
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm7
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm7
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm3
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdx), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5
; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm5
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 320(%r10), %zmm16
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rax), %zmm22
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm12
; AVX512DQ-BW-NEXT: vmovdqa64 320(%r8), %zmm29
; AVX512DQ-BW-NEXT: vmovdqa64 320(%r9), %zmm5
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm8
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm12
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm12
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm8
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm23, %zmm12
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm12
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm25, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm6
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdx), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdx), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rcx), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8]
; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9]
; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11]
; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm26
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm11, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm30
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm11, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm31
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm31
; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm31
; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm28
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm27
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm9
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm20
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm13
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm28
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm28
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm18
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm16
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm22
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm22
; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm29
; AVX512DQ-BW-NEXT: vmovdqa64 384(%r10), %zmm13
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rax), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm15
; AVX512DQ-BW-NEXT: vmovdqa64 384(%r8), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 384(%r9), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm8
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm6
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm30
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm30
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm13
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm24
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm24
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm26
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm26
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm17
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm17
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7]
; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 448(%r10), %zmm16
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rax), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm19
; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm21
; AVX512DQ-BW-NEXT: vmovdqa64 448(%r8), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 448(%r9), %zmm0
; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm23
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6]
; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm25
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm18
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm18
; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16
; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm11
; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12
; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm10
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7]
; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm0
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm4
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm6
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 64(%rcx), %ymm0
; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %ymm1
; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %ymm2
; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm4
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 128(%rcx), %ymm0
; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %ymm1
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-BW-NEXT: vmovdqa 128(%rsi), %ymm4
; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm6
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 192(%rcx), %ymm0
; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %ymm1
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-BW-NEXT: vmovdqa 192(%rsi), %ymm4
; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm6
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 256(%rcx), %ymm0
; AVX512DQ-BW-NEXT: vmovdqa 256(%rdx), %ymm1
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-BW-NEXT: vmovdqa 256(%rsi), %ymm4
; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm6
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 320(%rcx), %ymm0
; AVX512DQ-BW-NEXT: vmovdqa 320(%rdx), %ymm1
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-BW-NEXT: vmovdqa 320(%rsi), %ymm4
; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %ymm6
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 384(%rcx), %ymm0
; AVX512DQ-BW-NEXT: vmovdqa 384(%rdx), %ymm1
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-BW-NEXT: vmovdqa 384(%rsi), %ymm4
; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm6
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 448(%rcx), %ymm0
; AVX512DQ-BW-NEXT: vmovdqa 448(%rdx), %ymm1
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512DQ-BW-NEXT: vmovdqa 448(%rsi), %ymm4
; AVX512DQ-BW-NEXT: vmovdqa 448(%rdi), %ymm5
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1}
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1
; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 128(%rsi), %xmm1
; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1
; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm4
; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa 192(%rsi), %xmm1
; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1
; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm7
; AVX512DQ-BW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %xmm16
; AVX512DQ-BW-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %xmm17
; AVX512DQ-BW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %xmm16
; AVX512DQ-BW-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %xmm17
; AVX512DQ-BW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %xmm17
; AVX512DQ-BW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %xmm20
; AVX512DQ-BW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %xmm20
; AVX512DQ-BW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %xmm26
; AVX512DQ-BW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 3776(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 3712(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 3264(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 3200(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 2752(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 2688(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 2240(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 2176(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 1728(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 1664(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 1216(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 1152(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 704(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 640(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 192(%rax)
; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 128(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 4032(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3968(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3904(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3840(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 3648(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 3584(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3520(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3456(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3392(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3328(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 3136(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 3072(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3008(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2944(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2880(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2816(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 2624(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 2560(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2496(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2432(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2368(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2304(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 2112(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 2048(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 1984(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 1920(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 1856(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm8, 1792(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 1600(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 1536(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm1, 1472(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm1, 1408(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm1, 1344(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm1, 1280(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 1088(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1024(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm1, 960(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm1, 896(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm1, 832(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm1, 768(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 576(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 512(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm1, 448(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm1, 384(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm1, 320(%rax)
; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovaps %zmm1, 256(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-BW-NEXT: addq $5384, %rsp # imm = 0x1508
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i64_stride8_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: subq $5384, %rsp # imm = 0x1508
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm30
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm24
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm28
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm22
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm26
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm31
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm16
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm17
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm27
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm13
; AVX512DQ-BW-FCP-NEXT: movb $-64, %r11b
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13]
; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm8
AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 25296; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 25297; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm8 25298; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7] 25299; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} 25300; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 25301; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25302; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] 25303; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 25304; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 25305; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 25306; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 25307; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 25308; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 25309; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 25310; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 25311; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] 25312; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 25313; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25314; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] 25315; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 25316; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 25317; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 25318; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] 25319; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 25320; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 25321; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] 25322; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 25323; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25324; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 25325; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 25326; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 25327; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 25328; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 25329; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 25330; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 25331; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] 25332; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 25333; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 25334; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25335; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 25336; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm6 25337; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 25338; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm8 25339; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 25340; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 25341; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm8 25342; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7] 25343; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 25344; 
AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 25345; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25346; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 25347; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 25348; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 25349; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 25350; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 25351; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25352; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 25353; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm8 25354; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] 25355; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 25356; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25357; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 25358; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 25359; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] 25360; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 25361; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 25362; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] 25363; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 25364; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25365; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 25366; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 25367; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 25368; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 25369; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 25370; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 25371; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 25372; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] 25373; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} 25374; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 25375; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25376; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 25377; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 25378; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 25379; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 25380; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 25381; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 25382; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 25383; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] 25384; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} 25385; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 25386; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25387; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 25388; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm4 25389; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 25390; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 25391; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 25392; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 25393; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25394; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25395; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 25396; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25397; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25398; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] 25399; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 25400; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25401; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 25402; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 25403; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3 25404; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 25405; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 25406; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 25407; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 25408; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 25409; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] 25410; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 25411; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25412; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 25413; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 25414; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 25415; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 25416; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm5 25417; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] 25418; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm9 25419; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm20 25420; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 25421; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm19, %zmm6 25422; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm8 25423; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 25424; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] 25425; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} 25426; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 25427; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25428; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 25429; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 25430; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 25431; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 25432; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 25433; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 25434; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm6 25435; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7] 25436; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} 25437; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 25438; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25439; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 25440; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 25441; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 25442; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm6 25443; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 25444; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25445; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 
25446; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 25447; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] 25448; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 25449; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25450; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 25451; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 25452; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] 25453; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 25454; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 25455; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] 25456; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 25457; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25458; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm1 25459; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm3 25460; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 25461; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 25462; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 25463; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm5 25464; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 25465; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm6 25466; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] 25467; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r10), %zmm13 25468; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm2 25469; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 25470; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 25471; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm18 25472; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm15 25473; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] 25474; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} 25475; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 25476; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25477; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 25478; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 25479; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 25480; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 25481; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 25482; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 25483; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 25484; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] 25485; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} 25486; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 25487; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25488; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 25489; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 25490; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 25491; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 25492; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 25493; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 25494; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm7 25495; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] 25496; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 25497; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
%zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25498; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 25499; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 25500; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] 25501; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 25502; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 25503; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] 25504; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 25505; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25506; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm1 25507; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm4 25508; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 25509; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 25510; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 25511; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm7 25512; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 25513; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 25514; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] 25515; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r10), %zmm16 25516; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rax), %zmm22 25517; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 25518; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 25519; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm29 25520; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm5 25521; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] 25522; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} 25523; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 25524; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25525; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 25526; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 25527; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 25528; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 25529; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] 25530; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 25531; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 25532; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] 25533; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} 25534; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 25535; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25536; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 25537; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 25538; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 25539; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm12 25540; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] 25541; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm12 25542; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 25543; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] 25544; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 25545; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25546; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 25547; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 25548; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] 25549; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm29, %zmm4 25550; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 25551; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] 25552; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 25553; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25554; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm3 25555; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm1 25556; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 25557; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 25558; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25559; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 25560; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 25561; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25562; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 25563; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 25564; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25565; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 25566; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25567; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 25568; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm1 25569; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 25570; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 25571; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25572; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 25573; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 25574; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25575; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 25576; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 25577; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25578; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 25579; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25580; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm3 25581; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm1 25582; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 25583; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 25584; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill 25585; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 25586; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 25587; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25588; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 25589; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 25590; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25591; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 25592; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25593; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 25594; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm1 25595; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 25596; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 25597; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25598; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 25599; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 25600; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25601; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 25602; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 25603; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25604; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 25605; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25606; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] 25607; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 25608; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 25609; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 25610; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25611; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] 25612; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 25613; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 25614; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 25615; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25616; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] 25617; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 25618; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 25619; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 25620; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25621; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] 25622; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 25623; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 25624; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25625; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 25626; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 25627; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25628; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 25629; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 25630; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25631; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 25632; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 25633; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25634; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 25635; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25636; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 25637; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 25638; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25639; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 25640; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 25641; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25642; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 25643; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 25644; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25645; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 25646; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25647; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 25648; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 25649; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 25650; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25651; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 25652; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 25653; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25654; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 25655; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 25656; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 25657; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25658; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 25659; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 25660; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 25661; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 25662; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25663; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 25664; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 25665; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25666; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 25667; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 25668; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25669; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 25670; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25671; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 25672; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 25673; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 25674; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 25675; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25676; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 25677; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 25678; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25679; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 25680; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 25681; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 25682; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25683; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 25684; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 25685; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25686; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 25687; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 25688; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25689; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 25690; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 25691; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25692; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 25693; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25694; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 25695; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 25696; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 25697; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25698; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 25699; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 25700; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25701; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 25702; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 25703; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 25704; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25705; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 25706; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 25707; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25708; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 25709; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 25710; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25711; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 25712; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 25713; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25714; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 25715; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25716; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 25717; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 25718; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25719; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 25720; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 25721; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25722; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm28 25723; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 25724; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 25725; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25726; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 25727; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 25728; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25729; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 25730; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 25731; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25732; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 25733; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 25734; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25735; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 25736; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25737; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 25738; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 25739; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25740; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 25741; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 25742; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25743; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 25744; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 25745; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 25746; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r10), %zmm13 25747; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 25748; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 25749; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 25750; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25751; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 25752; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 25753; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25754; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 25755; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 25756; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 25757; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 25758; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm5 25759; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
384(%r9), %zmm2 25760; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 25761; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 25762; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] 25763; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 25764; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 25765; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] 25766; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 25767; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 25768; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 25769; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 25770; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 25771; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 25772; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 25773; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 25774; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 25775; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] 25776; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] 25777; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 25778; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r10), %zmm16 25779; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm1 25780; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 25781; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 25782; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25783; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 25784; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 25785; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25786; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 25787; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 25788; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 25789; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 25790; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 25791; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] 25792; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 25793; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] 25794; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 25795; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 25796; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 25797; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm11 25798; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 25799; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 25800; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] 25801; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] 25802; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 25803; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25804; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 25805; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 25806; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} 25807; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 25808; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill 25809; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25810; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 25811; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 25812; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} 25813; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 25814; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25815; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25816; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 25817; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 25818; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 25819; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25820; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25821; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 25822; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 25823; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 25824; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25825; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25826; AVX512DQ-BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 25827; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 25828; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} 25829; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 25830; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25831; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25832; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 25833; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 25834; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} 25835; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 25836; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25837; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25838; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 25839; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 25840; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 25841; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25842; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25843; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 25844; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] 25845; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 25846; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25847; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25848; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 25849; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} 25850; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm0 25851; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 25852; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 25853; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm4 25854; 
AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 25855; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 25856; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 25857; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 25858; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill 25859; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 25860; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 25861; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 25862; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 25863; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 25864; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 25865; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 25866; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25867; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25868; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} 25869; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 25870; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %ymm1 25871; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %ymm2 25872; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 25873; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 25874; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] 25875; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] 25876; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 25877; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 25878; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 25879; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 25880; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} 25881; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 25882; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] 25883; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 25884; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 25885; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25886; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} 25887; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rcx), %ymm0 25888; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %ymm1 25889; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 25890; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rsi), %ymm4 25891; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 25892; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 25893; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 25894; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 25895; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 25896; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 25897; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 25898; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 25899; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 25900; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 25901; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 25902; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25903; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} 25904; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rcx), %ymm0 25905; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %ymm1 25906; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 25907; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rsi), %ymm4 25908; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 25909; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 25910; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 25911; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 25912; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 25913; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 25914; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 25915; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 25916; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 25917; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 25918; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 25919; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25920; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} 25921; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rcx), %ymm0 25922; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdx), %ymm1 25923; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 25924; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rsi), %ymm4 25925; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 25926; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 25927; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 25928; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 25929; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 25930; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 25931; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} 25932; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 25933; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 25934; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 25935; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 25936; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25937; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} 25938; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rcx), %ymm0 25939; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdx), %ymm1 25940; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 25941; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rsi), %ymm4 25942; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm6 25943; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 25944; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 25945; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 25946; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 25947; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} 25948; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 25949; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 25950; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm1[2,3],ymm0[2,3] 25951; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 25952; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} 25953; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rcx), %ymm0 25954; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdx), %ymm1 25955; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 25956; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rsi), %ymm4 25957; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 25958; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 25959; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 25960; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 25961; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} 25962; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 25963; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] 25964; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 25965; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 25966; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} 25967; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rcx), %ymm0 25968; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdx), %ymm1 25969; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 25970; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rsi), %ymm4 25971; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm5 25972; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] 25973; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] 25974; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 25975; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 25976; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] 25977; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 25978; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} 25979; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 25980; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 25981; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 25982; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} 25983; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 25984; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 25985; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 25986; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 25987; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] 25988; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 25989; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 25990; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 25991; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 25992; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] 25993; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 25994; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 25995; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 25996; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} 25997; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm1 25998; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 25999; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 26000; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 26001; 
AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] 26002; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 26003; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 26004; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 26005; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} 26006; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] 26007; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 26008; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 26009; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 26010; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} 26011; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm1 26012; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 26013; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm4 26014; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 26015; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] 26016; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 26017; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 26018; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 26019; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} 26020; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] 26021; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 26022; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 26023; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 26024; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} 26025; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm1 26026; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 26027; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 26028; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 26029; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] 26030; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 26031; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 26032; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload 26033; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} 26034; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] 26035; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 26036; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 26037; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 26038; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} 26039; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %xmm16 26040; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 26041; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %xmm17 26042; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 26043; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] 26044; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 26045; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload 26046; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 26047; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} 26048; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 
= ymm17[1],ymm16[1],ymm17[3],ymm16[3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %xmm16
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %xmm17
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %xmm17
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %xmm20
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %xmm20
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %xmm26
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 3776(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 3712(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 3264(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 3200(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 2688(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 2240(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 2176(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 1728(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 1664(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 1152(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 704(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 640(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 192(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 4032(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3968(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3904(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3840(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 3648(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 3584(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3520(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3456(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3392(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3328(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 3136(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 3072(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3008(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2944(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2880(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2816(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 2624(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 2560(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2496(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2432(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2368(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2304(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 2112(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 2048(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 1984(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 1920(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 1856(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 1792(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 1600(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 1536(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 1472(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 1408(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 1344(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 1280(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 1088(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 960(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 896(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 832(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 768(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 576(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 448(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 384(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 320(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 256(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: addq $5384, %rsp # imm = 0x1508
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64
  %in.vec1 = load <64 x i64>, ptr %in.vecptr1, align 64
  %in.vec2 = load <64 x i64>, ptr %in.vecptr2, align 64
  %in.vec3 = load <64 x i64>, ptr %in.vecptr3, align 64
  %in.vec4 = load <64 x i64>, ptr %in.vecptr4, align 64
  %in.vec5 = load <64 x i64>, ptr %in.vecptr5, align 64
  %in.vec6 = load <64 x i64>, ptr %in.vecptr6, align 64
  %in.vec7 = load <64 x i64>, ptr %in.vecptr7, align 64
  %1 = shufflevector <64 x i64> %in.vec0, <64 x i64> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %2 = shufflevector <64 x i64> %in.vec2, <64 x i64> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %3 = shufflevector <64 x i64> %in.vec4, <64 x i64> %in.vec5, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %4 = shufflevector <64 x i64> %in.vec6, <64 x i64> %in.vec7, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %5 = shufflevector <128 x i64> %1, <128 x i64> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
  %6 = shufflevector <128 x i64> %3, <128 x i64> %4, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
  %7 = shufflevector <256 x i64> %5, <256 x i64> %6, <512 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319, i32 320, i32 321, i32 322, i32 323, i32 324, i32 325, i32 326, i32 327, i32 328, i32 329, i32 330, i32 331, i32 332, i32 333, i32 334, i32 335, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 360, i32 361, i32 362, i32 363, i32 364, i32 365, i32 366, i32 367, i32 368, i32 369, i32 370, i32 371, i32 372, i32 373, i32 374, i32 375, i32 376, i32 377, i32 378, i32 379, i32 380, i32 381, i32 382, i32 383, i32 384, i32 385, i32 386, i32 387, i32 388, i32 389, i32 390, i32 391, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 415, i32 416, i32 417, i32 418, i32 419, i32 420, i32 421, i32 422, i32 423, i32 424, i32 425, i32 426, i32 427, i32 428, i32 429, i32 430, i32 431, i32 432, i32 433, i32 434, i32 435, i32 436, i32 437, i32 438, i32 439, i32 440, i32 441, i32 442, i32 443, i32 444, i32 445, i32 446, i32 447, i32 448, i32 449, i32 450, i32 451, i32 452, i32 453, i32 454, i32 455, i32 456, i32 457, i32 458, i32 459, i32 460, i32 461, i32 462, i32 463, i32 464, i32 465, i32 466, i32 467, i32 468, i32 469, i32 470, i32 471, i32 472, i32 473, i32 474, i32 475, i32 476, i32 477, i32 478, i32 479, i32 480, i32 481, i32 482, i32 483, i32 484, i32 485, i32 486, i32 487, i32 488, i32 489, i32 490, i32 491, i32 492, i32 493, i32 494, i32 495, i32 496, i32 497, i32 498, i32 499, i32 500, i32 501, i32 502, i32 503, i32 504, i32 505, i32 506, i32 507, i32 508, i32 509, i32 510, i32 511>
  %interleaved.vec = shufflevector <512 x i64> %7, <512 x i64> poison, <512 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 320, i32 384, i32 448, i32 1, i32 65, i32 129, i32 193, i32 257, i32 321, i32 385, i32 449, i32 2, i32 66, i32 130, i32 194, i32 258, i32 322, i32 386, i32 450, i32 3, i32 67, i32 131, i32 195, i32 259, i32 323, i32 387, i32 451, i32 4, i32 68, i32 132, i32 196, i32 260, i32 324, i32 388, i32 452, i32 5, i32 69, i32 133, i32 197, i32 261, i32 325, i32 389, i32 453, i32 6, i32 70, i32 134, i32 198, i32 262, i32 326, i32 390, i32 454, i32 7, i32 71, i32 135, i32 199, i32 263, i32 327, i32 391, i32 455, i32 8, i32 72, i32 136, i32 200, i32 264, i32 328, i32 392, i32 456, i32 9, i32 73, i32 137, i32 201, i32 265, i32 329, i32 393, i32 457, i32 10, i32 74, i32 138, i32 202, i32 266, i32 330, i32 394, i32 458, i32 11, i32 75, i32 139, i32 203, i32 267, i32 331, i32 395, i32 459, i32 12, i32 76, i32 140, i32 204, i32 268, i32 332, i32 396, i32 460, i32 13, i32 77, i32 141, i32 205, i32 269, i32 333, i32 397, i32 461, i32 14, i32 78, i32 142, i32 206, i32 270, i32 334, i32 398, i32 462, i32 15, i32 79, i32 143, i32 207, i32 271, i32 335, i32 399, i32 463, i32 16, i32 80, i32 144, i32 208, i32 272, i32 336, i32 400, i32 464, i32 17, i32 81, i32 145, i32 209, i32 273, i32 337, i32 401, i32 465, i32 18, i32 82, i32 146, i32 210, i32 274, i32 338, i32 402, i32 466, i32 19, i32 83, i32 147, i32 211, i32 275, i32 339, i32 403, i32 467, i32 20, i32 84, i32 148, i32 212, i32 276, i32 340, i32 404, i32 468, i32 21, i32 85, i32 149, i32 213, i32 277, i32 341, i32 405, i32 469, i32 22, i32 86, i32 150, i32 214, i32 278, i32 342, i32 406, i32 470, i32 23, i32 87, i32 151, i32 215, i32 279, i32 343, i32 407, i32 471, i32 24, i32 88, i32 152, i32 216, i32 280, i32 344, i32 408, i32 472, i32 25, i32 89, i32 153, i32 217, i32 281, i32 345, i32 409, i32 473, i32 26, i32 90, i32 154, i32 218, i32 282, i32 346, i32 410, i32 474, i32 27, i32 91, i32 155, i32 219, i32 283, i32 347, i32 411, i32 475, i32 28, i32 92, i32 156, i32 220, i32 284, i32 348, i32 412, i32 476, i32 29, i32 93, i32 157, i32 221, i32 285, i32 349, i32 413, i32 477, i32 30, i32 94, i32 158, i32 222, i32 286, i32 350, i32 414, i32 478, i32 31, i32 95, i32 159, i32 223, i32 287, i32 351, i32 415, i32 479, i32 32, i32 96, i32 160, i32 224, i32 288, i32 352, i32 416, i32 480, i32 33, i32 97, i32 161, i32 225, i32 289, i32 353, i32 417, i32 481, i32 34, i32 98, i32 162, i32 226, i32 290, i32 354, i32 418, i32 482, i32 35, i32 99, i32 163, i32 227, i32 291, i32 355, i32 419, i32 483, i32 36, i32 100, i32 164, i32 228, i32 292, i32 356, i32 420, i32 484, i32 37, i32 101, i32 165, i32 229, i32 293, i32 357, i32 421, i32 485, i32 38, i32 102, i32 166, i32 230, i32 294, i32 358, i32 422, i32 486, i32 39, i32 103, i32 167, i32 231, i32 295, i32 359, i32 423, i32 487, i32 40, i32 104, i32 168, i32 232, i32 296, i32 360, i32 424, i32 488, i32 41, i32 105, i32 169, i32 233, i32 297, i32 361, i32 425, i32 489, i32 42, i32 106, i32 170, i32 234, i32 298, i32 362, i32 426, i32 490, i32 43, i32 107, i32 171, i32 235, i32 299, i32 363, i32 427, i32 491, i32 44, i32 108, i32 172, i32 236, i32 300, i32 364, i32 428, i32 492, i32 45, i32 109, i32 173, i32 237, i32 301, i32 365, i32 429, i32 493, i32 46, i32 110, i32 174, i32 238, i32 302, i32 366, i32 430, i32 494, i32 47, i32 111, i32 175, i32 239, i32 303, i32 367, i32 431, i32 495, i32 48, i32 112, i32 176, i32 240, i32 304, i32 368, i32 432, i32 496, i32 49, i32 113, i32 177, i32 241, i32 305, i32 369, i32 433, i32 497, i32 50, i32 114, i32 178, i32 242, i32 306, i32 370, i32 434, i32 498, i32 51, i32 115, i32 179, i32 243, i32 307, i32 371, i32 435, i32 499, i32 52, i32 116, i32 180, i32 244, i32 308, i32 372, i32 436, i32 500, i32 53, i32 117, i32 181, i32 245, i32 309, i32 373, i32 437, i32 501, i32 54, i32 118, i32 182, i32 246, i32 310, i32 374, i32 438, i32 502, i32 55, i32 119, i32 183, i32 247, i32 311, i32 375, i32 439, i32 503, i32 56, i32 120, i32 184, i32 248, i32 312, i32 376, i32 440, i32 504, i32 57, i32 121, i32 185, i32 249, i32 313, i32 377, i32 441, i32 505, i32 58, i32 122, i32 186, i32 250, i32 314, i32 378, i32 442, i32 506, i32 59, i32 123, i32 187, i32 251, i32 315, i32 379, i32 443, i32 507, i32 60, i32 124, i32 188, i32 252, i32 316, i32 380, i32 444, i32 508, i32 61, i32 125, i32 189, i32 253, i32 317, i32 381, i32 445, i32 509, i32 62, i32 126, i32 190, i32 254, i32 318, i32 382, i32 446, i32 510, i32 63, i32 127, i32 191, i32 255, i32 319, i32 383, i32 447, i32 511>
  store <512 x i64> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
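
; Editorial note (not an assertion line): the final shufflevector mask above
; places element j of input vector k at output position 8*j+k, i.e. the eight
; 64-element inputs are stored lane-interleaved at stride 8. An illustrative
; C sketch of the equivalent scalar store (hypothetical helper name, shown
; only to clarify the mask; it is not generated from this test):
;
;   void store_i64_stride8_sketch(const long long in[8][64], long long out[512]) {
;     for (int j = 0; j < 64; ++j)      /* element index within each input  */
;       for (int k = 0; k < 8; ++k)     /* which of the eight input vectors */
;         out[8 * j + k] = in[k][j];    /* out[8*j+k] = in_k[j]             */
;   }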