; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.
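; As a rough illustration (not part of the test input), the kind of scalar
; source loop that LoopVectorizer turns into the IR tested below looks like
; the following; the array names are hypothetical:
;
;   for (int i = 0; i < n; ++i) {
;     out0[i] = in[8 * i + 0];
;     out1[i] = in[8 * i + 1];
;     ...
;     out7[i] = in[8 * i + 7];
;   }
;
; For a vectorization factor VF, the vectorizer emits one contiguous load of
; 8*VF i32 elements and then one shufflevector per result vector selecting
; elements k, k+8, k+16, ... from that wide load, which is the pattern in
; each function body below (e.g. <16 x i32> with masks <0,8>, <1,9>, ... for
; VF=2).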
17 18define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { 19; SSE-LABEL: load_i32_stride8_vf2: 20; SSE: # %bb.0: 21; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 22; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 23; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 24; SSE-NEXT: movdqa (%rdi), %xmm0 25; SSE-NEXT: movdqa 16(%rdi), %xmm1 26; SSE-NEXT: movdqa 32(%rdi), %xmm2 27; SSE-NEXT: movdqa 48(%rdi), %xmm3 28; SSE-NEXT: movdqa %xmm0, %xmm4 29; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] 30; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] 31; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 32; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 33; SSE-NEXT: movdqa %xmm1, %xmm6 34; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] 35; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] 36; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] 37; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 38; SSE-NEXT: movq %xmm4, (%rsi) 39; SSE-NEXT: movq %xmm5, (%rdx) 40; SSE-NEXT: movq %xmm0, (%rcx) 41; SSE-NEXT: movq %xmm2, (%r8) 42; SSE-NEXT: movq %xmm6, (%r9) 43; SSE-NEXT: movq %xmm7, (%r11) 44; SSE-NEXT: movq %xmm1, (%r10) 45; SSE-NEXT: movq %xmm3, (%rax) 46; SSE-NEXT: retq 47; 48; AVX-LABEL: load_i32_stride8_vf2: 49; AVX: # %bb.0: 50; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 51; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 52; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11 53; AVX-NEXT: vmovaps (%rdi), %ymm0 54; AVX-NEXT: vmovaps 32(%rdi), %ymm1 55; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 56; AVX-NEXT: vmovdqa (%rdi), %xmm3 57; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 58; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] 59; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6,7] 60; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] 61; AVX-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 62; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 63; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] 64; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 65; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] 66; AVX-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] 67; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 68; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] 69; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 70; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3] 71; AVX-NEXT: vmovq %xmm4, (%rsi) 72; AVX-NEXT: vmovq %xmm5, (%rdx) 73; AVX-NEXT: vmovq %xmm2, (%rcx) 74; AVX-NEXT: vpextrq $1, %xmm2, (%r8) 75; AVX-NEXT: vmovlps %xmm3, (%r9) 76; AVX-NEXT: vmovlps %xmm6, (%r11) 77; AVX-NEXT: vmovlps %xmm7, (%r10) 78; AVX-NEXT: vmovlps %xmm0, (%rax) 79; AVX-NEXT: vzeroupper 80; AVX-NEXT: retq 81; 82; AVX2-LABEL: load_i32_stride8_vf2: 83; AVX2: # %bb.0: 84; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 85; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 86; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 87; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 88; AVX2-NEXT: vmovaps (%rdi), %ymm1 89; AVX2-NEXT: vmovdqa (%rdi), %xmm2 90; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3 91; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 92; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] 93; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3] 94; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] 95; AVX2-NEXT: 
vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 96; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3 97; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] 98; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] 99; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 100; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] 101; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 102; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 103; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 104; AVX2-NEXT: vmovq %xmm4, (%rsi) 105; AVX2-NEXT: vmovq %xmm5, (%rdx) 106; AVX2-NEXT: vmovq %xmm2, (%rcx) 107; AVX2-NEXT: vpextrq $1, %xmm2, (%r8) 108; AVX2-NEXT: vmovlps %xmm3, (%r9) 109; AVX2-NEXT: vmovlps %xmm6, (%r11) 110; AVX2-NEXT: vmovlps %xmm1, (%r10) 111; AVX2-NEXT: vmovlps %xmm0, (%rax) 112; AVX2-NEXT: vzeroupper 113; AVX2-NEXT: retq 114; 115; AVX2-FP-LABEL: load_i32_stride8_vf2: 116; AVX2-FP: # %bb.0: 117; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 118; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 119; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 120; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0 121; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1 122; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 123; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3 124; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 125; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] 126; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3] 127; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] 128; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 129; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3 130; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] 131; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] 132; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 133; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] 134; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm1 135; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 136; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 137; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) 138; AVX2-FP-NEXT: vmovq %xmm5, (%rdx) 139; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) 140; AVX2-FP-NEXT: vpextrq $1, %xmm2, (%r8) 141; AVX2-FP-NEXT: vmovlps %xmm3, (%r9) 142; AVX2-FP-NEXT: vmovlps %xmm6, (%r11) 143; AVX2-FP-NEXT: vmovlps %xmm1, (%r10) 144; AVX2-FP-NEXT: vmovlps %xmm0, (%rax) 145; AVX2-FP-NEXT: vzeroupper 146; AVX2-FP-NEXT: retq 147; 148; AVX2-FCP-LABEL: load_i32_stride8_vf2: 149; AVX2-FCP: # %bb.0: 150; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 151; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 152; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 153; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0 154; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1 155; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 156; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 157; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 158; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] 159; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3] 160; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] 161; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 162; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3 163; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] 164; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = 
ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] 165; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 166; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] 167; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 168; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 169; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 170; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) 171; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx) 172; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) 173; AVX2-FCP-NEXT: vpextrq $1, %xmm2, (%r8) 174; AVX2-FCP-NEXT: vmovlps %xmm3, (%r9) 175; AVX2-FCP-NEXT: vmovlps %xmm6, (%r11) 176; AVX2-FCP-NEXT: vmovlps %xmm1, (%r10) 177; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax) 178; AVX2-FCP-NEXT: vzeroupper 179; AVX2-FCP-NEXT: retq 180; 181; AVX512-LABEL: load_i32_stride8_vf2: 182; AVX512: # %bb.0: 183; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 184; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 185; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 186; AVX512-NEXT: vmovdqa (%rdi), %xmm0 187; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 188; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 189; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 190; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] 191; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 192; AVX512-NEXT: vmovaps 32(%rdi), %ymm1 193; AVX512-NEXT: vmovaps (%rdi), %ymm4 194; AVX512-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] 195; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5 196; AVX512-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] 197; AVX512-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] 198; AVX512-NEXT: vextractf128 $1, %ymm6, %xmm6 199; AVX512-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] 200; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm4 201; AVX512-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] 202; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 203; AVX512-NEXT: vmovq %xmm2, (%rsi) 204; AVX512-NEXT: vmovq %xmm3, (%rdx) 205; AVX512-NEXT: vmovq %xmm0, (%rcx) 206; AVX512-NEXT: vpextrq $1, %xmm0, (%r8) 207; AVX512-NEXT: vmovlps %xmm5, (%r9) 208; AVX512-NEXT: vmovlps %xmm6, (%r11) 209; AVX512-NEXT: vmovlps %xmm4, (%r10) 210; AVX512-NEXT: vmovlps %xmm1, (%rax) 211; AVX512-NEXT: vzeroupper 212; AVX512-NEXT: retq 213; 214; AVX512-FCP-LABEL: load_i32_stride8_vf2: 215; AVX512-FCP: # %bb.0: 216; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 217; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 218; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 219; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 220; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 221; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 222; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] 223; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 224; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 225; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm1 226; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4 227; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] 228; AVX512-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 229; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] 230; AVX512-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 231; AVX512-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 232; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] 233; AVX512-FCP-NEXT: 
vextractf128 $1, %ymm1, %xmm4 234; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] 235; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 236; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) 237; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) 238; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) 239; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8) 240; AVX512-FCP-NEXT: vmovlps %xmm5, (%r9) 241; AVX512-FCP-NEXT: vmovlps %xmm6, (%r11) 242; AVX512-FCP-NEXT: vmovlps %xmm4, (%r10) 243; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax) 244; AVX512-FCP-NEXT: vzeroupper 245; AVX512-FCP-NEXT: retq 246; 247; AVX512DQ-LABEL: load_i32_stride8_vf2: 248; AVX512DQ: # %bb.0: 249; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 250; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 251; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 252; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 253; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 254; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 255; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 256; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] 257; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 258; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm1 259; AVX512DQ-NEXT: vmovaps (%rdi), %ymm4 260; AVX512DQ-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] 261; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5 262; AVX512DQ-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] 263; AVX512DQ-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] 264; AVX512DQ-NEXT: vextractf128 $1, %ymm6, %xmm6 265; AVX512DQ-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] 266; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm4 267; AVX512DQ-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] 268; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm1 269; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) 270; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) 271; AVX512DQ-NEXT: vmovq %xmm0, (%rcx) 272; AVX512DQ-NEXT: vpextrq $1, %xmm0, (%r8) 273; AVX512DQ-NEXT: vmovlps %xmm5, (%r9) 274; AVX512DQ-NEXT: vmovlps %xmm6, (%r11) 275; AVX512DQ-NEXT: vmovlps %xmm4, (%r10) 276; AVX512DQ-NEXT: vmovlps %xmm1, (%rax) 277; AVX512DQ-NEXT: vzeroupper 278; AVX512DQ-NEXT: retq 279; 280; AVX512DQ-FCP-LABEL: load_i32_stride8_vf2: 281; AVX512DQ-FCP: # %bb.0: 282; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 283; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 284; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 285; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 286; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 287; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 288; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] 289; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 290; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 291; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm1 292; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4 293; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] 294; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 295; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] 296; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 297; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 298; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] 299; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 300; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] 301; 
AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 302; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) 303; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) 304; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) 305; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8) 306; AVX512DQ-FCP-NEXT: vmovlps %xmm5, (%r9) 307; AVX512DQ-FCP-NEXT: vmovlps %xmm6, (%r11) 308; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r10) 309; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax) 310; AVX512DQ-FCP-NEXT: vzeroupper 311; AVX512DQ-FCP-NEXT: retq 312; 313; AVX512BW-LABEL: load_i32_stride8_vf2: 314; AVX512BW: # %bb.0: 315; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 316; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 317; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 318; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 319; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1 320; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 321; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 322; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] 323; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 324; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm1 325; AVX512BW-NEXT: vmovaps (%rdi), %ymm4 326; AVX512BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] 327; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5 328; AVX512BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] 329; AVX512BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] 330; AVX512BW-NEXT: vextractf128 $1, %ymm6, %xmm6 331; AVX512BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] 332; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm4 333; AVX512BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] 334; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm1 335; AVX512BW-NEXT: vmovq %xmm2, (%rsi) 336; AVX512BW-NEXT: vmovq %xmm3, (%rdx) 337; AVX512BW-NEXT: vmovq %xmm0, (%rcx) 338; AVX512BW-NEXT: vpextrq $1, %xmm0, (%r8) 339; AVX512BW-NEXT: vmovlps %xmm5, (%r9) 340; AVX512BW-NEXT: vmovlps %xmm6, (%r11) 341; AVX512BW-NEXT: vmovlps %xmm4, (%r10) 342; AVX512BW-NEXT: vmovlps %xmm1, (%rax) 343; AVX512BW-NEXT: vzeroupper 344; AVX512BW-NEXT: retq 345; 346; AVX512BW-FCP-LABEL: load_i32_stride8_vf2: 347; AVX512BW-FCP: # %bb.0: 348; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 349; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 350; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 351; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 352; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 353; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 354; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] 355; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 356; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 357; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 358; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4 359; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] 360; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 361; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] 362; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 363; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 364; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] 365; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 366; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] 367; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 368; AVX512BW-FCP-NEXT: vmovq 
%xmm2, (%rsi) 369; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) 370; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) 371; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) 372; AVX512BW-FCP-NEXT: vmovlps %xmm5, (%r9) 373; AVX512BW-FCP-NEXT: vmovlps %xmm6, (%r11) 374; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r10) 375; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax) 376; AVX512BW-FCP-NEXT: vzeroupper 377; AVX512BW-FCP-NEXT: retq 378; 379; AVX512DQ-BW-LABEL: load_i32_stride8_vf2: 380; AVX512DQ-BW: # %bb.0: 381; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 382; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 383; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 384; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 385; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1 386; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 387; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 388; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] 389; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 390; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm1 391; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm4 392; AVX512DQ-BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] 393; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5 394; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] 395; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] 396; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm6, %xmm6 397; AVX512DQ-BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] 398; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm4 399; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] 400; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm1 401; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) 402; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) 403; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) 404; AVX512DQ-BW-NEXT: vpextrq $1, %xmm0, (%r8) 405; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%r9) 406; AVX512DQ-BW-NEXT: vmovlps %xmm6, (%r11) 407; AVX512DQ-BW-NEXT: vmovlps %xmm4, (%r10) 408; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rax) 409; AVX512DQ-BW-NEXT: vzeroupper 410; AVX512DQ-BW-NEXT: retq 411; 412; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf2: 413; AVX512DQ-BW-FCP: # %bb.0: 414; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 415; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 416; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 417; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 418; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 419; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 420; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] 421; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 422; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 423; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 424; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4 425; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] 426; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 427; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] 428; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 429; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 430; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] 431; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 432; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] 433; 
AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 434; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) 435; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) 436; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) 437; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) 438; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm5, (%r9) 439; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm6, (%r11) 440; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r10) 441; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax) 442; AVX512DQ-BW-FCP-NEXT: vzeroupper 443; AVX512DQ-BW-FCP-NEXT: retq 444 %wide.vec = load <16 x i32>, ptr %in.vec, align 64 445 %strided.vec0 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 0, i32 8> 446 %strided.vec1 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 1, i32 9> 447 %strided.vec2 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 2, i32 10> 448 %strided.vec3 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 3, i32 11> 449 %strided.vec4 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 4, i32 12> 450 %strided.vec5 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 5, i32 13> 451 %strided.vec6 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 6, i32 14> 452 %strided.vec7 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 7, i32 15> 453 store <2 x i32> %strided.vec0, ptr %out.vec0, align 64 454 store <2 x i32> %strided.vec1, ptr %out.vec1, align 64 455 store <2 x i32> %strided.vec2, ptr %out.vec2, align 64 456 store <2 x i32> %strided.vec3, ptr %out.vec3, align 64 457 store <2 x i32> %strided.vec4, ptr %out.vec4, align 64 458 store <2 x i32> %strided.vec5, ptr %out.vec5, align 64 459 store <2 x i32> %strided.vec6, ptr %out.vec6, align 64 460 store <2 x i32> %strided.vec7, ptr %out.vec7, align 64 461 ret void 462} 463 464define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { 465; SSE-LABEL: load_i32_stride8_vf4: 466; SSE: # %bb.0: 467; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 468; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 469; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 470; SSE-NEXT: movaps 112(%rdi), %xmm3 471; SSE-NEXT: movaps 80(%rdi), %xmm2 472; SSE-NEXT: movaps (%rdi), %xmm1 473; SSE-NEXT: movaps 16(%rdi), %xmm0 474; SSE-NEXT: movaps 32(%rdi), %xmm4 475; SSE-NEXT: movaps 48(%rdi), %xmm5 476; SSE-NEXT: movaps 96(%rdi), %xmm6 477; SSE-NEXT: movaps 64(%rdi), %xmm7 478; SSE-NEXT: movaps %xmm7, %xmm8 479; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] 480; SSE-NEXT: movaps %xmm1, %xmm9 481; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] 482; SSE-NEXT: movaps %xmm9, %xmm10 483; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] 484; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] 485; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] 486; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] 487; SSE-NEXT: movaps %xmm1, %xmm4 488; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] 489; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] 490; SSE-NEXT: movaps %xmm2, %xmm6 491; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] 492; SSE-NEXT: movaps %xmm0, %xmm7 493; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] 494; SSE-NEXT: movaps %xmm7, %xmm8 495; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] 496; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] 497; 
SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] 498; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] 499; SSE-NEXT: movaps %xmm0, %xmm3 500; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] 501; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] 502; SSE-NEXT: movaps %xmm10, (%rsi) 503; SSE-NEXT: movaps %xmm9, (%rdx) 504; SSE-NEXT: movaps %xmm4, (%rcx) 505; SSE-NEXT: movaps %xmm1, (%r8) 506; SSE-NEXT: movaps %xmm8, (%r9) 507; SSE-NEXT: movaps %xmm7, (%r11) 508; SSE-NEXT: movaps %xmm3, (%r10) 509; SSE-NEXT: movaps %xmm0, (%rax) 510; SSE-NEXT: retq 511; 512; AVX-LABEL: load_i32_stride8_vf4: 513; AVX: # %bb.0: 514; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 515; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 516; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11 517; AVX-NEXT: vmovaps (%rdi), %ymm0 518; AVX-NEXT: vmovaps 32(%rdi), %ymm1 519; AVX-NEXT: vmovaps 64(%rdi), %ymm2 520; AVX-NEXT: vmovaps 96(%rdi), %ymm3 521; AVX-NEXT: vmovaps 32(%rdi), %xmm4 522; AVX-NEXT: vmovaps (%rdi), %xmm5 523; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 524; AVX-NEXT: vmovaps 96(%rdi), %xmm7 525; AVX-NEXT: vmovaps 64(%rdi), %xmm8 526; AVX-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] 527; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0] 528; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] 529; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] 530; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] 531; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm7[2,2,2,2] 532; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3] 533; AVX-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] 534; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm10[2,3] 535; AVX-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] 536; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] 537; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] 538; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 539; AVX-NEXT: vunpcklps {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 540; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 541; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,0] 542; AVX-NEXT: vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 543; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 544; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] 545; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10 546; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm8[2,3] 547; AVX-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] 548; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10 549; AVX-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] 550; AVX-NEXT: vextractf128 $1, %ymm11, %xmm11 551; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,0] 552; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] 553; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 554; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] 555; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 556; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] 557; AVX-NEXT: vmovaps %xmm6, (%rsi) 558; AVX-NEXT: vmovaps %xmm9, (%rdx) 559; AVX-NEXT: vmovaps %xmm5, (%rcx) 560; AVX-NEXT: vmovaps %xmm4, (%r8) 561; AVX-NEXT: vmovaps %xmm7, (%r9) 562; AVX-NEXT: vmovaps %xmm8, (%r11) 563; AVX-NEXT: vmovaps %xmm10, (%r10) 564; AVX-NEXT: vmovaps %xmm0, (%rax) 565; AVX-NEXT: vzeroupper 566; AVX-NEXT: retq 567; 568; AVX2-LABEL: 
load_i32_stride8_vf4: 569; AVX2: # %bb.0: 570; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 571; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 572; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 573; AVX2-NEXT: vmovaps 96(%rdi), %ymm0 574; AVX2-NEXT: vmovaps 64(%rdi), %ymm1 575; AVX2-NEXT: vmovaps 32(%rdi), %ymm2 576; AVX2-NEXT: vmovaps (%rdi), %ymm3 577; AVX2-NEXT: vmovaps 96(%rdi), %xmm4 578; AVX2-NEXT: vbroadcastss %xmm4, %xmm5 579; AVX2-NEXT: vmovaps (%rdi), %xmm6 580; AVX2-NEXT: vmovaps 32(%rdi), %xmm7 581; AVX2-NEXT: vmovaps 64(%rdi), %xmm8 582; AVX2-NEXT: vbroadcastss %xmm8, %xmm9 583; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] 584; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 585; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] 586; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] 587; AVX2-NEXT: vshufps {{.*#+}} xmm10 = xmm6[1,1,1,1] 588; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm7[1],xmm10[2,3] 589; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] 590; AVX2-NEXT: vshufps {{.*#+}} xmm10 = xmm4[2,2,2,2] 591; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3] 592; AVX2-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] 593; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm10[2,3] 594; AVX2-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] 595; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm4[1] 596; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 597; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 598; AVX2-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 599; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[2,2,2,2] 600; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm10[2,3] 601; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8 602; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5] 603; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] 604; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10 605; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] 606; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] 607; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 608; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] 609; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] 610; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] 611; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 612; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] 613; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 614; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] 615; AVX2-NEXT: vmovaps %xmm5, (%rsi) 616; AVX2-NEXT: vmovaps %xmm9, (%rdx) 617; AVX2-NEXT: vmovaps %xmm7, (%rcx) 618; AVX2-NEXT: vmovaps %xmm4, (%r8) 619; AVX2-NEXT: vmovaps %xmm6, (%r9) 620; AVX2-NEXT: vmovaps %xmm8, (%r11) 621; AVX2-NEXT: vmovaps %xmm1, (%r10) 622; AVX2-NEXT: vmovaps %xmm0, (%rax) 623; AVX2-NEXT: vzeroupper 624; AVX2-NEXT: retq 625; 626; AVX2-FP-LABEL: load_i32_stride8_vf4: 627; AVX2-FP: # %bb.0: 628; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 629; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 630; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 631; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0 632; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm1 633; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm2 634; AVX2-FP-NEXT: vmovaps (%rdi), %ymm3 635; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm4 636; AVX2-FP-NEXT: vbroadcastss %xmm4, %xmm5 637; AVX2-FP-NEXT: vmovaps 
(%rdi), %xmm6 638; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm7 639; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm8 640; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm9 641; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] 642; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 643; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] 644; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] 645; AVX2-FP-NEXT: vshufps {{.*#+}} xmm10 = xmm6[1,1,1,1] 646; AVX2-FP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm7[1],xmm10[2,3] 647; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] 648; AVX2-FP-NEXT: vshufps {{.*#+}} xmm10 = xmm4[2,2,2,2] 649; AVX2-FP-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3] 650; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] 651; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm10[2,3] 652; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] 653; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm4[1] 654; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 655; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 656; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 657; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[2,2,2,2] 658; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm10[2,3] 659; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8 660; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5] 661; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] 662; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10 663; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] 664; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] 665; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm3 666; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] 667; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] 668; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] 669; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 670; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] 671; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 672; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] 673; AVX2-FP-NEXT: vmovaps %xmm5, (%rsi) 674; AVX2-FP-NEXT: vmovaps %xmm9, (%rdx) 675; AVX2-FP-NEXT: vmovaps %xmm7, (%rcx) 676; AVX2-FP-NEXT: vmovaps %xmm4, (%r8) 677; AVX2-FP-NEXT: vmovaps %xmm6, (%r9) 678; AVX2-FP-NEXT: vmovaps %xmm8, (%r11) 679; AVX2-FP-NEXT: vmovaps %xmm1, (%r10) 680; AVX2-FP-NEXT: vmovaps %xmm0, (%rax) 681; AVX2-FP-NEXT: vzeroupper 682; AVX2-FP-NEXT: retq 683; 684; AVX2-FCP-LABEL: load_i32_stride8_vf4: 685; AVX2-FCP: # %bb.0: 686; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 687; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 688; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 689; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0 690; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm1 691; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm2 692; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm3 693; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm4 694; AVX2-FCP-NEXT: vbroadcastss %xmm4, %xmm5 695; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm6 696; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm7 697; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm8 698; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm9 699; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] 700; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm9 = 
xmm6[0],xmm7[0],xmm6[1],xmm7[1] 701; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] 702; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] 703; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm10 = xmm6[1,1,1,1] 704; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm7[1],xmm10[2,3] 705; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] 706; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm10 = xmm4[2,2,2,2] 707; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3] 708; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] 709; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm10[2,3] 710; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] 711; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm4[1] 712; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 713; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 714; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 715; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[2,2,2,2] 716; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm10[2,3] 717; AVX2-FCP-NEXT: vextractf128 $1, %ymm8, %xmm8 718; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5] 719; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] 720; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm10 721; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] 722; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] 723; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm3 724; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] 725; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] 726; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] 727; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 728; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] 729; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 730; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] 731; AVX2-FCP-NEXT: vmovaps %xmm5, (%rsi) 732; AVX2-FCP-NEXT: vmovaps %xmm9, (%rdx) 733; AVX2-FCP-NEXT: vmovaps %xmm7, (%rcx) 734; AVX2-FCP-NEXT: vmovaps %xmm4, (%r8) 735; AVX2-FCP-NEXT: vmovaps %xmm6, (%r9) 736; AVX2-FCP-NEXT: vmovaps %xmm8, (%r11) 737; AVX2-FCP-NEXT: vmovaps %xmm1, (%r10) 738; AVX2-FCP-NEXT: vmovaps %xmm0, (%rax) 739; AVX2-FCP-NEXT: vzeroupper 740; AVX2-FCP-NEXT: retq 741; 742; AVX512-LABEL: load_i32_stride8_vf4: 743; AVX512: # %bb.0: 744; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 745; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 746; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 747; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] 748; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 749; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 750; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 751; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] 752; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 753; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] 754; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 755; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] 756; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 757; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] 758; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 759; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] 760; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 761; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] 762; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 763; 
AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] 764; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 765; AVX512-NEXT: vmovdqa %xmm0, (%rsi) 766; AVX512-NEXT: vmovdqa %xmm3, (%rdx) 767; AVX512-NEXT: vmovdqa %xmm4, (%rcx) 768; AVX512-NEXT: vmovdqa %xmm5, (%r8) 769; AVX512-NEXT: vmovdqa %xmm6, (%r9) 770; AVX512-NEXT: vmovdqa %xmm7, (%r11) 771; AVX512-NEXT: vmovdqa %xmm8, (%r10) 772; AVX512-NEXT: vmovdqa %xmm9, (%rax) 773; AVX512-NEXT: vzeroupper 774; AVX512-NEXT: retq 775; 776; AVX512-FCP-LABEL: load_i32_stride8_vf4: 777; AVX512-FCP: # %bb.0: 778; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 779; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 780; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 781; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] 782; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 783; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 784; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 785; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] 786; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 787; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] 788; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 789; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] 790; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 791; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] 792; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 793; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] 794; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 795; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] 796; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 797; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] 798; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 799; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) 800; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) 801; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx) 802; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r8) 803; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r9) 804; AVX512-FCP-NEXT: vmovdqa %xmm7, (%r11) 805; AVX512-FCP-NEXT: vmovdqa %xmm8, (%r10) 806; AVX512-FCP-NEXT: vmovdqa %xmm9, (%rax) 807; AVX512-FCP-NEXT: vzeroupper 808; AVX512-FCP-NEXT: retq 809; 810; AVX512DQ-LABEL: load_i32_stride8_vf4: 811; AVX512DQ: # %bb.0: 812; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 813; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 814; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 815; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] 816; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1 817; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 818; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 819; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] 820; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 821; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] 822; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 823; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] 824; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 825; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] 826; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 827; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] 828; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 829; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] 830; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 831; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] 832; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 833; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) 834; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) 835; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx) 836; AVX512DQ-NEXT: vmovdqa %xmm5, (%r8) 837; AVX512DQ-NEXT: vmovdqa %xmm6, (%r9) 838; AVX512DQ-NEXT: vmovdqa %xmm7, (%r11) 839; AVX512DQ-NEXT: vmovdqa %xmm8, (%r10) 
840; AVX512DQ-NEXT: vmovdqa %xmm9, (%rax) 841; AVX512DQ-NEXT: vzeroupper 842; AVX512DQ-NEXT: retq 843; 844; AVX512DQ-FCP-LABEL: load_i32_stride8_vf4: 845; AVX512DQ-FCP: # %bb.0: 846; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 847; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 848; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 849; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] 850; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 851; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 852; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 853; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] 854; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 855; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] 856; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 857; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] 858; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 859; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] 860; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 861; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] 862; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 863; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] 864; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 865; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] 866; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 867; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) 868; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) 869; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx) 870; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r8) 871; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r9) 872; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%r11) 873; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%r10) 874; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, (%rax) 875; AVX512DQ-FCP-NEXT: vzeroupper 876; AVX512DQ-FCP-NEXT: retq 877; 878; AVX512BW-LABEL: load_i32_stride8_vf4: 879; AVX512BW: # %bb.0: 880; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 881; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 882; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 883; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] 884; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 885; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 886; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 887; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] 888; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 889; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] 890; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 891; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] 892; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 893; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] 894; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 895; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] 896; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 897; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] 898; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 899; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] 900; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 901; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) 902; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) 903; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx) 904; AVX512BW-NEXT: vmovdqa %xmm5, (%r8) 905; AVX512BW-NEXT: vmovdqa %xmm6, (%r9) 906; AVX512BW-NEXT: vmovdqa %xmm7, (%r11) 907; AVX512BW-NEXT: vmovdqa %xmm8, (%r10) 908; AVX512BW-NEXT: vmovdqa %xmm9, (%rax) 909; AVX512BW-NEXT: vzeroupper 910; AVX512BW-NEXT: retq 911; 912; AVX512BW-FCP-LABEL: load_i32_stride8_vf4: 913; AVX512BW-FCP: # %bb.0: 914; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 915; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 916; AVX512BW-FCP-NEXT: 
movq {{[0-9]+}}(%rsp), %r11 917; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] 918; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 919; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 920; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 921; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] 922; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 923; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] 924; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 925; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] 926; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 927; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] 928; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 929; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] 930; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 931; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] 932; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 933; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] 934; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 935; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) 936; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) 937; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) 938; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8) 939; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9) 940; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%r11) 941; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r10) 942; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%rax) 943; AVX512BW-FCP-NEXT: vzeroupper 944; AVX512BW-FCP-NEXT: retq 945; 946; AVX512DQ-BW-LABEL: load_i32_stride8_vf4: 947; AVX512DQ-BW: # %bb.0: 948; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 949; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 950; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 951; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] 952; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 953; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 954; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 955; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] 956; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 957; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] 958; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 959; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] 960; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 961; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] 962; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 963; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] 964; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 965; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] 966; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 967; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] 968; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 969; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) 970; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) 971; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx) 972; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8) 973; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9) 974; AVX512DQ-BW-NEXT: vmovdqa %xmm7, (%r11) 975; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%r10) 976; AVX512DQ-BW-NEXT: vmovdqa %xmm9, (%rax) 977; AVX512DQ-BW-NEXT: vzeroupper 978; AVX512DQ-BW-NEXT: retq 979; 980; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf4: 981; AVX512DQ-BW-FCP: # %bb.0: 982; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 983; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 984; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 985; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] 986; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 987; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), 
%zmm2 988; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 989; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] 990; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 991; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] 992; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 993; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] 994; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 995; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] 996; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 997; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] 998; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 999; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] 1000; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 1001; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] 1002; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 1003; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) 1004; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) 1005; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) 1006; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8) 1007; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9) 1008; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%r11) 1009; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r10) 1010; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%rax) 1011; AVX512DQ-BW-FCP-NEXT: vzeroupper 1012; AVX512DQ-BW-FCP-NEXT: retq 1013 %wide.vec = load <32 x i32>, ptr %in.vec, align 64 1014 %strided.vec0 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24> 1015 %strided.vec1 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 1, i32 9, i32 17, i32 25> 1016 %strided.vec2 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 2, i32 10, i32 18, i32 26> 1017 %strided.vec3 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 3, i32 11, i32 19, i32 27> 1018 %strided.vec4 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 4, i32 12, i32 20, i32 28> 1019 %strided.vec5 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 5, i32 13, i32 21, i32 29> 1020 %strided.vec6 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 6, i32 14, i32 22, i32 30> 1021 %strided.vec7 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 7, i32 15, i32 23, i32 31> 1022 store <4 x i32> %strided.vec0, ptr %out.vec0, align 64 1023 store <4 x i32> %strided.vec1, ptr %out.vec1, align 64 1024 store <4 x i32> %strided.vec2, ptr %out.vec2, align 64 1025 store <4 x i32> %strided.vec3, ptr %out.vec3, align 64 1026 store <4 x i32> %strided.vec4, ptr %out.vec4, align 64 1027 store <4 x i32> %strided.vec5, ptr %out.vec5, align 64 1028 store <4 x i32> %strided.vec6, ptr %out.vec6, align 64 1029 store <4 x i32> %strided.vec7, ptr %out.vec7, align 64 1030 ret void 1031} 1032 1033define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { 1034; SSE-LABEL: load_i32_stride8_vf8: 1035; SSE: # %bb.0: 1036; SSE-NEXT: movaps 112(%rdi), %xmm15 1037; SSE-NEXT: movaps 176(%rdi), %xmm4 1038; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1039; SSE-NEXT: movaps 144(%rdi), %xmm9 1040; SSE-NEXT: movaps (%rdi), %xmm10 1041; SSE-NEXT: movaps 32(%rdi), %xmm1 1042; SSE-NEXT: movaps 96(%rdi), %xmm13 1043; SSE-NEXT: movaps 64(%rdi), %xmm11 1044; SSE-NEXT: movaps 160(%rdi), %xmm2 1045; SSE-NEXT: movaps 128(%rdi), 
%xmm6 1046; SSE-NEXT: movaps 224(%rdi), %xmm12 1047; SSE-NEXT: movaps 192(%rdi), %xmm0 1048; SSE-NEXT: movaps %xmm0, %xmm8 1049; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] 1050; SSE-NEXT: movaps %xmm6, %xmm5 1051; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] 1052; SSE-NEXT: movaps %xmm5, %xmm7 1053; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] 1054; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1055; SSE-NEXT: movaps %xmm11, %xmm14 1056; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] 1057; SSE-NEXT: movaps %xmm10, %xmm7 1058; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] 1059; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] 1060; SSE-NEXT: movaps %xmm7, %xmm8 1061; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0] 1062; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1063; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] 1064; SSE-NEXT: movaps 240(%rdi), %xmm14 1065; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1066; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] 1067; SSE-NEXT: movaps 208(%rdi), %xmm12 1068; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] 1069; SSE-NEXT: movaps %xmm6, %xmm2 1070; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 1071; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1072; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] 1073; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] 1074; SSE-NEXT: movaps %xmm10, %xmm8 1075; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0] 1076; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] 1077; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1] 1078; SSE-NEXT: movaps %xmm12, %xmm0 1079; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] 1080; SSE-NEXT: movaps %xmm9, %xmm11 1081; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] 1082; SSE-NEXT: movaps %xmm11, %xmm13 1083; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] 1084; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] 1085; SSE-NEXT: movaps 80(%rdi), %xmm2 1086; SSE-NEXT: movaps %xmm2, %xmm1 1087; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] 1088; SSE-NEXT: movaps 16(%rdi), %xmm0 1089; SSE-NEXT: movaps 48(%rdi), %xmm3 1090; SSE-NEXT: movaps %xmm0, %xmm14 1091; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] 1092; SSE-NEXT: movaps %xmm14, %xmm4 1093; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] 1094; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] 1095; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 1096; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] 1097; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 1098; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] 1099; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] 1100; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] 1101; SSE-NEXT: movaps %xmm9, %xmm1 1102; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] 1103; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] 1104; SSE-NEXT: movaps %xmm0, %xmm3 1105; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] 1106; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] 1107; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1108; SSE-NEXT: movaps %xmm2, (%rsi) 1109; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1110; SSE-NEXT: movaps %xmm2, 16(%rsi) 1111; SSE-NEXT: movaps %xmm7, (%rdx) 1112; SSE-NEXT: movaps %xmm5, 16(%rdx) 1113; SSE-NEXT: movaps %xmm8, (%rcx) 1114; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1115; SSE-NEXT: movaps %xmm2, 16(%rcx) 1116; SSE-NEXT: movaps %xmm10, (%r8) 1117; SSE-NEXT: movaps %xmm6, 16(%r8) 1118; SSE-NEXT: movaps %xmm4, (%r9) 1119; SSE-NEXT: movaps %xmm13, 16(%r9) 1120; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1121; SSE-NEXT: movaps %xmm14, (%rax) 1122; SSE-NEXT: movaps %xmm11, 16(%rax) 1123; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1124; SSE-NEXT: movaps %xmm3, (%rax) 1125; SSE-NEXT: movaps %xmm1, 16(%rax) 1126; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1127; SSE-NEXT: movaps %xmm9, 16(%rax) 1128; SSE-NEXT: movaps %xmm0, (%rax) 1129; SSE-NEXT: retq 1130; 1131; AVX-LABEL: load_i32_stride8_vf8: 1132; AVX: # %bb.0: 1133; AVX-NEXT: vmovaps (%rdi), %ymm0 1134; AVX-NEXT: vmovaps 32(%rdi), %ymm1 1135; AVX-NEXT: vmovaps 64(%rdi), %ymm2 1136; AVX-NEXT: vmovaps 96(%rdi), %ymm3 1137; AVX-NEXT: vmovaps 32(%rdi), %xmm8 1138; AVX-NEXT: vmovaps (%rdi), %xmm11 1139; AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] 1140; AVX-NEXT: vmovaps 96(%rdi), %xmm9 1141; AVX-NEXT: vmovaps 64(%rdi), %xmm10 1142; AVX-NEXT: vmovaps 160(%rdi), %xmm14 1143; AVX-NEXT: vmovaps 128(%rdi), %xmm15 1144; AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 1145; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 1146; AVX-NEXT: vmovaps 224(%rdi), %xmm12 1147; AVX-NEXT: vmovaps 192(%rdi), %xmm13 1148; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] 1149; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1] 1150; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 1151; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] 1152; AVX-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] 1153; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] 1154; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 1155; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1156; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm11[1,1,1,1] 1157; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3] 1158; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] 1159; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 1160; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1] 1161; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3] 1162; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 1163; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm6[6,7] 1164; AVX-NEXT: vmovaps 160(%rdi), %ymm6 1165; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] 1166; AVX-NEXT: vmovaps 128(%rdi), %ymm7 1167; AVX-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] 1168; AVX-NEXT: vunpckhps {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] 1169; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm11 1170; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,2,2] 1171; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1,2],xmm14[3] 1172; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 1173; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] 1174; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2] 1175; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm10[0,1,2],xmm14[3] 1176; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm8[0,1],xmm14[2,3] 1177; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm11[4,5,6,7] 1178; AVX-NEXT: vmovaps 192(%rdi), %ymm11 1179; AVX-NEXT: vunpckhps {{.*#+}} xmm9 = 
xmm10[2],xmm9[2],xmm10[3],xmm9[3] 1180; AVX-NEXT: vmovaps 224(%rdi), %ymm10 1181; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] 1182; AVX-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] 1183; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 1184; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm15[2,3,2,3] 1185; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 1186; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] 1187; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm9[4,5,6,7] 1188; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] 1189; AVX-NEXT: vunpcklps {{.*#+}} ymm12 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 1190; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,0],ymm12[4,5],ymm8[6,4] 1191; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] 1192; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 1193; AVX-NEXT: vunpcklps {{.*#+}} ymm13 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 1194; AVX-NEXT: vextractf128 $1, %ymm13, %xmm13 1195; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,0] 1196; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7] 1197; AVX-NEXT: vunpcklps {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] 1198; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm6[1,0],ymm7[1,0],ymm6[5,4],ymm7[5,4] 1199; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm13[2,0],ymm8[2,3],ymm13[6,4],ymm8[6,7] 1200; AVX-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 1201; AVX-NEXT: vextractf128 $1, %ymm13, %xmm13 1202; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] 1203; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 1204; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,0],xmm13[2,3] 1205; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 1206; AVX-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] 1207; AVX-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] 1208; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,0],ymm15[4,5],ymm13[6,4] 1209; AVX-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] 1210; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 1211; AVX-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] 1212; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 1213; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,0] 1214; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] 1215; AVX-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] 1216; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm7[3,0],ymm6[7,4],ymm7[7,4] 1217; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm10[2,3],ymm6[6,4],ymm10[6,7] 1218; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] 1219; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] 1220; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1221; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1222; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1223; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] 1224; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 1225; AVX-NEXT: vmovaps %ymm1, (%rsi) 1226; AVX-NEXT: vmovaps %ymm5, (%rdx) 1227; AVX-NEXT: vmovaps %ymm14, (%rcx) 1228; AVX-NEXT: vmovaps %ymm9, (%r8) 1229; AVX-NEXT: vmovaps %ymm12, (%r9) 1230; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 
1231; AVX-NEXT: vmovaps %ymm8, (%rax) 1232; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 1233; AVX-NEXT: vmovaps %ymm4, (%rax) 1234; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 1235; AVX-NEXT: vmovaps %ymm0, (%rax) 1236; AVX-NEXT: vzeroupper 1237; AVX-NEXT: retq 1238; 1239; AVX2-LABEL: load_i32_stride8_vf8: 1240; AVX2: # %bb.0: 1241; AVX2-NEXT: vmovaps 96(%rdi), %ymm0 1242; AVX2-NEXT: vmovaps 64(%rdi), %ymm1 1243; AVX2-NEXT: vmovaps 32(%rdi), %ymm2 1244; AVX2-NEXT: vmovaps (%rdi), %ymm3 1245; AVX2-NEXT: vmovaps 160(%rdi), %xmm7 1246; AVX2-NEXT: vmovaps 128(%rdi), %xmm11 1247; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] 1248; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 1249; AVX2-NEXT: vmovaps 224(%rdi), %xmm8 1250; AVX2-NEXT: vbroadcastss %xmm8, %xmm5 1251; AVX2-NEXT: vmovaps 192(%rdi), %xmm10 1252; AVX2-NEXT: vbroadcastss %xmm10, %xmm6 1253; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 1254; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 1255; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 1256; AVX2-NEXT: vmovaps 96(%rdi), %xmm9 1257; AVX2-NEXT: vbroadcastss %xmm9, %xmm5 1258; AVX2-NEXT: vmovaps (%rdi), %xmm13 1259; AVX2-NEXT: vmovaps 32(%rdi), %xmm14 1260; AVX2-NEXT: vmovaps 64(%rdi), %xmm12 1261; AVX2-NEXT: vbroadcastss %xmm12, %xmm6 1262; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 1263; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] 1264; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 1265; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 1266; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] 1267; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1] 1268; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3] 1269; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 1270; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] 1271; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 1272; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] 1273; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] 1274; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 1275; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7] 1276; AVX2-NEXT: vmovaps 224(%rdi), %ymm6 1277; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7] 1278; AVX2-NEXT: vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] 1279; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7 1280; AVX2-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2] 1281; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] 1282; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 1283; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] 1284; AVX2-NEXT: vmovaps 192(%rdi), %ymm11 1285; AVX2-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] 1286; AVX2-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2] 1287; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3] 1288; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] 1289; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] 1290; AVX2-NEXT: vmovaps 160(%rdi), %ymm13 1291; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] 1292; AVX2-NEXT: vmovaps 128(%rdi), %ymm10 1293; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 1294; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3] 1295; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 1296; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] 1297; AVX2-NEXT: vunpckhps 
{{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] 1298; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1] 1299; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1300; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] 1301; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 1302; AVX2-NEXT: vextractf128 $1, %ymm12, %xmm12 1303; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 1304; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] 1305; AVX2-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3] 1306; AVX2-NEXT: vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] 1307; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] 1308; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] 1309; AVX2-NEXT: vbroadcastss 148(%rdi), %ymm12 1310; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] 1311; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] 1312; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14 1313; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5] 1314; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] 1315; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15 1316; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] 1317; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] 1318; AVX2-NEXT: vbroadcastss 248(%rdi), %ymm14 1319; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] 1320; AVX2-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] 1321; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] 1322; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 1323; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] 1324; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] 1325; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] 1326; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7] 1327; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 1328; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm3 1329; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] 1330; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] 1331; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 1332; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] 1333; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 1334; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] 1335; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 1336; AVX2-NEXT: vmovaps %ymm4, (%rsi) 1337; AVX2-NEXT: vmovaps %ymm5, (%rdx) 1338; AVX2-NEXT: vmovaps %ymm7, (%rcx) 1339; AVX2-NEXT: vmovaps %ymm8, (%r8) 1340; AVX2-NEXT: vmovaps %ymm9, (%r9) 1341; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 1342; AVX2-NEXT: vmovaps %ymm12, (%rax) 1343; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 1344; AVX2-NEXT: vmovaps %ymm1, (%rax) 1345; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 1346; AVX2-NEXT: vmovaps %ymm0, (%rax) 1347; AVX2-NEXT: vzeroupper 1348; AVX2-NEXT: retq 1349; 1350; AVX2-FP-LABEL: load_i32_stride8_vf8: 1351; AVX2-FP: # %bb.0: 1352; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0 1353; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm1 1354; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm2 1355; AVX2-FP-NEXT: vmovaps (%rdi), %ymm3 
1356; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm7 1357; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm11 1358; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] 1359; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 1360; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm8 1361; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm5 1362; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm10 1363; AVX2-FP-NEXT: vbroadcastss %xmm10, %xmm6 1364; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 1365; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 1366; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 1367; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm9 1368; AVX2-FP-NEXT: vbroadcastss %xmm9, %xmm5 1369; AVX2-FP-NEXT: vmovaps (%rdi), %xmm13 1370; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm14 1371; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm12 1372; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm6 1373; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 1374; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] 1375; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 1376; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 1377; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] 1378; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1] 1379; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3] 1380; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 1381; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] 1382; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 1383; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] 1384; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] 1385; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 1386; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7] 1387; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm6 1388; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7] 1389; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] 1390; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7 1391; AVX2-FP-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2] 1392; AVX2-FP-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] 1393; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 1394; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] 1395; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm11 1396; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] 1397; AVX2-FP-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2] 1398; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3] 1399; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] 1400; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] 1401; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm13 1402; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] 1403; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm10 1404; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 1405; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3] 1406; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 1407; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] 1408; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] 1409; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1] 1410; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1411; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] 1412; AVX2-FP-NEXT: 
vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 1413; AVX2-FP-NEXT: vextractf128 $1, %ymm12, %xmm12 1414; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 1415; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] 1416; AVX2-FP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3] 1417; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] 1418; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] 1419; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] 1420; AVX2-FP-NEXT: vbroadcastss 148(%rdi), %ymm12 1421; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] 1422; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] 1423; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14 1424; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5] 1425; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] 1426; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15 1427; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] 1428; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] 1429; AVX2-FP-NEXT: vbroadcastss 248(%rdi), %ymm14 1430; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] 1431; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] 1432; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] 1433; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm3 1434; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] 1435; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] 1436; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] 1437; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7] 1438; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 1439; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm3 1440; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] 1441; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] 1442; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 1443; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] 1444; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 1445; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] 1446; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 1447; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) 1448; AVX2-FP-NEXT: vmovaps %ymm5, (%rdx) 1449; AVX2-FP-NEXT: vmovaps %ymm7, (%rcx) 1450; AVX2-FP-NEXT: vmovaps %ymm8, (%r8) 1451; AVX2-FP-NEXT: vmovaps %ymm9, (%r9) 1452; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1453; AVX2-FP-NEXT: vmovaps %ymm12, (%rax) 1454; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1455; AVX2-FP-NEXT: vmovaps %ymm1, (%rax) 1456; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1457; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) 1458; AVX2-FP-NEXT: vzeroupper 1459; AVX2-FP-NEXT: retq 1460; 1461; AVX2-FCP-LABEL: load_i32_stride8_vf8: 1462; AVX2-FCP: # %bb.0: 1463; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0 1464; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm1 1465; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm2 1466; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm3 1467; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm7 1468; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm11 1469; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = 
xmm11[0],xmm7[0],xmm11[1],xmm7[1] 1470; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 1471; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm8 1472; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm5 1473; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm10 1474; AVX2-FCP-NEXT: vbroadcastss %xmm10, %xmm6 1475; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 1476; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 1477; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 1478; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm9 1479; AVX2-FCP-NEXT: vbroadcastss %xmm9, %xmm5 1480; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm13 1481; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm14 1482; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm12 1483; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm6 1484; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 1485; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] 1486; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 1487; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 1488; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] 1489; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1] 1490; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3] 1491; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 1492; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] 1493; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 1494; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] 1495; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] 1496; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 1497; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7] 1498; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm6 1499; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7] 1500; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] 1501; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7 1502; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2] 1503; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] 1504; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 1505; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] 1506; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm11 1507; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] 1508; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2] 1509; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3] 1510; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] 1511; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] 1512; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm13 1513; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] 1514; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm10 1515; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 1516; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3] 1517; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 1518; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] 1519; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] 1520; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1] 1521; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1522; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] 1523; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = 
ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 1524; AVX2-FCP-NEXT: vextractf128 $1, %ymm12, %xmm12 1525; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 1526; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] 1527; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3] 1528; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] 1529; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] 1530; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] 1531; AVX2-FCP-NEXT: vbroadcastss 148(%rdi), %ymm12 1532; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] 1533; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] 1534; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14 1535; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5] 1536; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] 1537; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15 1538; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] 1539; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] 1540; AVX2-FCP-NEXT: vbroadcastss 248(%rdi), %ymm14 1541; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] 1542; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] 1543; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] 1544; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm3 1545; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] 1546; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] 1547; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] 1548; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7] 1549; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 1550; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm3 1551; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] 1552; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] 1553; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 1554; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] 1555; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 1556; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] 1557; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 1558; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) 1559; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx) 1560; AVX2-FCP-NEXT: vmovaps %ymm7, (%rcx) 1561; AVX2-FCP-NEXT: vmovaps %ymm8, (%r8) 1562; AVX2-FCP-NEXT: vmovaps %ymm9, (%r9) 1563; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1564; AVX2-FCP-NEXT: vmovaps %ymm12, (%rax) 1565; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1566; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax) 1567; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1568; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) 1569; AVX2-FCP-NEXT: vzeroupper 1570; AVX2-FCP-NEXT: retq 1571; 1572; AVX512-LABEL: load_i32_stride8_vf8: 1573; AVX512: # %bb.0: 1574; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 1575; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 1576; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 1577; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 1578; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 1579; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 1580; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 1581; 
AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] 1582; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1583; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] 1584; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1585; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 1586; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] 1587; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 1588; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] 1589; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1590; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 1591; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] 1592; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1593; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] 1594; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 1595; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 1596; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] 1597; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 1598; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] 1599; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 1600; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 1601; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] 1602; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 1603; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] 1604; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 1605; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1606; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] 1607; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1608; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] 1609; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1610; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1611; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] 1612; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1613; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] 1614; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 1615; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 1616; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] 1617; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 1618; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] 1619; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 1620; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] 1621; AVX512-NEXT: vmovdqa %ymm4, (%rsi) 1622; AVX512-NEXT: vmovdqa %ymm5, (%rdx) 1623; AVX512-NEXT: vmovdqa %ymm6, (%rcx) 1624; AVX512-NEXT: vmovdqa %ymm7, (%r8) 1625; AVX512-NEXT: vmovdqa %ymm8, (%r9) 1626; AVX512-NEXT: vmovdqa %ymm9, (%r11) 1627; AVX512-NEXT: vmovdqa %ymm10, (%r10) 1628; AVX512-NEXT: vmovdqa %ymm0, (%rax) 1629; AVX512-NEXT: vzeroupper 1630; AVX512-NEXT: retq 1631; 1632; AVX512-FCP-LABEL: load_i32_stride8_vf8: 1633; AVX512-FCP: # %bb.0: 1634; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1635; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1636; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 1637; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1638; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1639; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 1640; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 1641; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] 1642; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1643; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] 1644; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1645; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 1646; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] 1647; AVX512-FCP-NEXT: 
vpermi2d %zmm3, %zmm2, %zmm5 1648; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] 1649; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1650; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 1651; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] 1652; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1653; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] 1654; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 1655; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 1656; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] 1657; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 1658; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] 1659; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 1660; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 1661; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] 1662; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 1663; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] 1664; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 1665; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1666; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] 1667; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1668; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] 1669; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1670; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1671; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] 1672; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1673; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] 1674; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 1675; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 1676; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] 1677; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 1678; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] 1679; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 1680; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] 1681; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rsi) 1682; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rdx) 1683; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rcx) 1684; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r8) 1685; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r9) 1686; AVX512-FCP-NEXT: vmovdqa %ymm9, (%r11) 1687; AVX512-FCP-NEXT: vmovdqa %ymm10, (%r10) 1688; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) 1689; AVX512-FCP-NEXT: vzeroupper 1690; AVX512-FCP-NEXT: retq 1691; 1692; AVX512DQ-LABEL: load_i32_stride8_vf8: 1693; AVX512DQ: # %bb.0: 1694; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 1695; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 1696; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 1697; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 1698; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 1699; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 1700; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 1701; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] 1702; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1703; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] 1704; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1705; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 1706; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] 1707; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 1708; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] 1709; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1710; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 1711; AVX512DQ-NEXT: 
vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] 1712; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1713; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] 1714; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 1715; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 1716; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] 1717; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 1718; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] 1719; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 1720; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 1721; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] 1722; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 1723; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] 1724; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 1725; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1726; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] 1727; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1728; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] 1729; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1730; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1731; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] 1732; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1733; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] 1734; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 1735; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 1736; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] 1737; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 1738; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] 1739; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 1740; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] 1741; AVX512DQ-NEXT: vmovdqa %ymm4, (%rsi) 1742; AVX512DQ-NEXT: vmovdqa %ymm5, (%rdx) 1743; AVX512DQ-NEXT: vmovdqa %ymm6, (%rcx) 1744; AVX512DQ-NEXT: vmovdqa %ymm7, (%r8) 1745; AVX512DQ-NEXT: vmovdqa %ymm8, (%r9) 1746; AVX512DQ-NEXT: vmovdqa %ymm9, (%r11) 1747; AVX512DQ-NEXT: vmovdqa %ymm10, (%r10) 1748; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax) 1749; AVX512DQ-NEXT: vzeroupper 1750; AVX512DQ-NEXT: retq 1751; 1752; AVX512DQ-FCP-LABEL: load_i32_stride8_vf8: 1753; AVX512DQ-FCP: # %bb.0: 1754; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1755; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1756; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 1757; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1758; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1759; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 1760; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 1761; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] 1762; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1763; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] 1764; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1765; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 1766; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] 1767; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 1768; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] 1769; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1770; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 1771; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] 1772; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1773; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] 1774; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 1775; 
AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 1776; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] 1777; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 1778; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] 1779; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 1780; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 1781; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] 1782; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 1783; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] 1784; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 1785; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1786; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] 1787; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1788; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] 1789; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1790; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1791; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] 1792; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1793; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] 1794; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 1795; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 1796; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] 1797; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 1798; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] 1799; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 1800; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] 1801; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rsi) 1802; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rdx) 1803; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rcx) 1804; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r8) 1805; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r9) 1806; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%r11) 1807; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%r10) 1808; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) 1809; AVX512DQ-FCP-NEXT: vzeroupper 1810; AVX512DQ-FCP-NEXT: retq 1811; 1812; AVX512BW-LABEL: load_i32_stride8_vf8: 1813; AVX512BW: # %bb.0: 1814; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1815; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 1816; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 1817; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1818; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 1819; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 1820; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 1821; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] 1822; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1823; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] 1824; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1825; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 1826; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] 1827; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 1828; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] 1829; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1830; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 1831; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] 1832; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1833; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] 1834; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 1835; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 1836; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] 1837; AVX512BW-NEXT: vpermi2d %zmm3, 
%zmm2, %zmm7 1838; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] 1839; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 1840; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 1841; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] 1842; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 1843; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] 1844; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 1845; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1846; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] 1847; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1848; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] 1849; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1850; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1851; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] 1852; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1853; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] 1854; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 1855; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 1856; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] 1857; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 1858; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] 1859; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 1860; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] 1861; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) 1862; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx) 1863; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx) 1864; AVX512BW-NEXT: vmovdqa %ymm7, (%r8) 1865; AVX512BW-NEXT: vmovdqa %ymm8, (%r9) 1866; AVX512BW-NEXT: vmovdqa %ymm9, (%r11) 1867; AVX512BW-NEXT: vmovdqa %ymm10, (%r10) 1868; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) 1869; AVX512BW-NEXT: vzeroupper 1870; AVX512BW-NEXT: retq 1871; 1872; AVX512BW-FCP-LABEL: load_i32_stride8_vf8: 1873; AVX512BW-FCP: # %bb.0: 1874; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1875; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1876; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 1877; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1878; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1879; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 1880; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 1881; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] 1882; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1883; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] 1884; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1885; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 1886; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] 1887; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 1888; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] 1889; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1890; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 1891; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] 1892; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1893; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] 1894; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 1895; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 1896; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] 1897; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 1898; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] 1899; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 1900; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 1901; 
AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] 1902; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 1903; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] 1904; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 1905; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1906; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] 1907; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1908; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] 1909; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1910; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1911; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] 1912; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1913; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] 1914; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 1915; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 1916; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] 1917; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 1918; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] 1919; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 1920; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] 1921; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) 1922; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rdx) 1923; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) 1924; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8) 1925; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9) 1926; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r11) 1927; AVX512BW-FCP-NEXT: vmovdqa %ymm10, (%r10) 1928; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) 1929; AVX512BW-FCP-NEXT: vzeroupper 1930; AVX512BW-FCP-NEXT: retq 1931; 1932; AVX512DQ-BW-LABEL: load_i32_stride8_vf8: 1933; AVX512DQ-BW: # %bb.0: 1934; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1935; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 1936; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 1937; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 1938; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 1939; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 1940; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 1941; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] 1942; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1943; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] 1944; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1945; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 1946; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] 1947; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 1948; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] 1949; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1950; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 1951; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] 1952; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1953; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] 1954; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 1955; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 1956; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] 1957; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 1958; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] 1959; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 1960; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 1961; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] 1962; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, 
%zmm8 1963; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] 1964; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 1965; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1966; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] 1967; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1968; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] 1969; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1970; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1971; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] 1972; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1973; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] 1974; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 1975; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 1976; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] 1977; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 1978; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] 1979; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 1980; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] 1981; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi) 1982; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rdx) 1983; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rcx) 1984; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r8) 1985; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r9) 1986; AVX512DQ-BW-NEXT: vmovdqa %ymm9, (%r11) 1987; AVX512DQ-BW-NEXT: vmovdqa %ymm10, (%r10) 1988; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) 1989; AVX512DQ-BW-NEXT: vzeroupper 1990; AVX512DQ-BW-NEXT: retq 1991; 1992; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf8: 1993; AVX512DQ-BW-FCP: # %bb.0: 1994; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1995; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1996; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 1997; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1998; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1999; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 2000; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 2001; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] 2002; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 2003; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] 2004; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 2005; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 2006; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] 2007; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 2008; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] 2009; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 2010; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 2011; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] 2012; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 2013; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] 2014; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 2015; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 2016; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] 2017; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 2018; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] 2019; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 2020; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 2021; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] 2022; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 2023; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%r11)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm10, (%r10)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <64 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  %strided.vec1 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
  %strided.vec2 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
  %strided.vec3 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
  %strided.vec4 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
  %strided.vec5 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
  %strided.vec6 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
  %strided.vec7 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
  store <8 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <8 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <8 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <8 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <8 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <8 x i32> %strided.vec5, ptr %out.vec5, align 64
  store <8 x i32> %strided.vec6, ptr %out.vec6, align 64
  store <8 x i32> %strided.vec7, ptr %out.vec7, align 64
  ret void
}

define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
%out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { 2072; SSE-LABEL: load_i32_stride8_vf16: 2073; SSE: # %bb.0: 2074; SSE-NEXT: subq $296, %rsp # imm = 0x128 2075; SSE-NEXT: movaps 288(%rdi), %xmm6 2076; SSE-NEXT: movaps 352(%rdi), %xmm0 2077; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2078; SSE-NEXT: movaps 320(%rdi), %xmm5 2079; SSE-NEXT: movaps 416(%rdi), %xmm2 2080; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2081; SSE-NEXT: movaps 384(%rdi), %xmm12 2082; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2083; SSE-NEXT: movaps 480(%rdi), %xmm13 2084; SSE-NEXT: movaps 448(%rdi), %xmm4 2085; SSE-NEXT: movaps 160(%rdi), %xmm7 2086; SSE-NEXT: movaps 128(%rdi), %xmm10 2087; SSE-NEXT: movaps 224(%rdi), %xmm8 2088; SSE-NEXT: movaps 192(%rdi), %xmm3 2089; SSE-NEXT: movaps %xmm3, %xmm9 2090; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] 2091; SSE-NEXT: movaps %xmm10, %xmm11 2092; SSE-NEXT: movaps %xmm10, %xmm14 2093; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] 2094; SSE-NEXT: movaps %xmm11, %xmm10 2095; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] 2096; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2097; SSE-NEXT: movaps %xmm4, %xmm10 2098; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] 2099; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] 2100; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm9[1] 2101; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2102; SSE-NEXT: movaps %xmm12, %xmm9 2103; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] 2104; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2105; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] 2106; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2107; SSE-NEXT: movaps %xmm5, %xmm9 2108; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] 2109; SSE-NEXT: movaps 256(%rdi), %xmm15 2110; SSE-NEXT: movaps %xmm15, %xmm0 2111; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 2112; SSE-NEXT: movaps %xmm0, %xmm10 2113; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] 2114; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2115; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] 2116; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2117; SSE-NEXT: movaps 96(%rdi), %xmm10 2118; SSE-NEXT: movaps 64(%rdi), %xmm9 2119; SSE-NEXT: movaps %xmm9, %xmm11 2120; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] 2121; SSE-NEXT: movaps (%rdi), %xmm2 2122; SSE-NEXT: movaps 32(%rdi), %xmm12 2123; SSE-NEXT: movaps %xmm2, %xmm1 2124; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] 2125; SSE-NEXT: movaps %xmm1, %xmm0 2126; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] 2127; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2128; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] 2129; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2130; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] 2131; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] 2132; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] 2133; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 2134; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 2135; SSE-NEXT: # xmm13 = 
xmm13[2],mem[2],xmm13[3],mem[3] 2136; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 2137; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] 2138; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] 2139; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] 2140; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] 2141; SSE-NEXT: movaps %xmm14, %xmm0 2142; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] 2143; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2144; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1] 2145; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2146; SSE-NEXT: movaps %xmm13, %xmm0 2147; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] 2148; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2149; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] 2150; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2151; SSE-NEXT: movaps %xmm15, %xmm0 2152; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] 2153; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2154; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm5[1] 2155; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2156; SSE-NEXT: movaps %xmm2, %xmm0 2157; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] 2158; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2159; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] 2160; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2161; SSE-NEXT: movaps 240(%rdi), %xmm1 2162; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill 2163; SSE-NEXT: movaps 208(%rdi), %xmm15 2164; SSE-NEXT: movaps %xmm15, %xmm0 2165; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2166; SSE-NEXT: movaps 176(%rdi), %xmm2 2167; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2168; SSE-NEXT: movaps 144(%rdi), %xmm1 2169; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2170; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2171; SSE-NEXT: movaps %xmm1, %xmm2 2172; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 2173; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2174; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2175; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2176; SSE-NEXT: movaps 496(%rdi), %xmm1 2177; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2178; SSE-NEXT: movaps 464(%rdi), %xmm5 2179; SSE-NEXT: movaps %xmm5, %xmm0 2180; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2181; SSE-NEXT: movaps 432(%rdi), %xmm1 2182; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2183; SSE-NEXT: movaps 400(%rdi), %xmm6 2184; SSE-NEXT: movaps %xmm6, %xmm10 2185; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] 2186; SSE-NEXT: movaps %xmm10, %xmm1 2187; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2188; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2189; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] 2190; SSE-NEXT: movaps 368(%rdi), %xmm14 2191; SSE-NEXT: movaps 336(%rdi), %xmm2 2192; SSE-NEXT: movaps %xmm2, %xmm0 2193; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] 2194; SSE-NEXT: movaps 304(%rdi), %xmm12 2195; SSE-NEXT: movaps 272(%rdi), %xmm7 2196; SSE-NEXT: movaps %xmm7, %xmm4 2197; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] 2198; SSE-NEXT: movaps 
%xmm4, %xmm1 2199; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2200; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2201; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] 2202; SSE-NEXT: movaps 112(%rdi), %xmm13 2203; SSE-NEXT: movaps 80(%rdi), %xmm1 2204; SSE-NEXT: movaps %xmm1, %xmm0 2205; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] 2206; SSE-NEXT: movaps 16(%rdi), %xmm8 2207; SSE-NEXT: movaps 48(%rdi), %xmm11 2208; SSE-NEXT: movaps %xmm8, %xmm3 2209; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] 2210; SSE-NEXT: movaps %xmm3, %xmm9 2211; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] 2212; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2213; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 2214; SSE-NEXT: unpckhps (%rsp), %xmm15 # 16-byte Folded Reload 2215; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] 2216; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2217; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2218; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 2219; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] 2220; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] 2221; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 2222; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] 2223; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 2224; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] 2225; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] 2226; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] 2227; SSE-NEXT: movaps %xmm0, %xmm11 2228; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] 2229; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] 2230; SSE-NEXT: movaps %xmm0, %xmm12 2231; SSE-NEXT: movaps %xmm7, %xmm9 2232; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] 2233; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] 2234; SSE-NEXT: movaps %xmm6, %xmm0 2235; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] 2236; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] 2237; SSE-NEXT: movaps %xmm8, %xmm2 2238; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] 2239; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] 2240; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2241; SSE-NEXT: movaps %xmm1, 32(%rsi) 2242; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2243; SSE-NEXT: movaps %xmm1, 48(%rsi) 2244; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2245; SSE-NEXT: movaps %xmm1, (%rsi) 2246; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2247; SSE-NEXT: movaps %xmm1, 16(%rsi) 2248; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2249; SSE-NEXT: movaps %xmm1, 32(%rdx) 2250; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2251; SSE-NEXT: movaps %xmm1, 48(%rdx) 2252; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2253; SSE-NEXT: movaps %xmm1, (%rdx) 2254; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2255; SSE-NEXT: movaps %xmm1, 16(%rdx) 2256; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2257; SSE-NEXT: movaps %xmm1, 32(%rcx) 2258; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2259; SSE-NEXT: movaps %xmm1, 48(%rcx) 2260; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2261; SSE-NEXT: movaps %xmm1, (%rcx) 
2262; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2263; SSE-NEXT: movaps %xmm1, 16(%rcx) 2264; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2265; SSE-NEXT: movaps %xmm1, 32(%r8) 2266; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2267; SSE-NEXT: movaps %xmm1, 48(%r8) 2268; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2269; SSE-NEXT: movaps %xmm1, (%r8) 2270; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2271; SSE-NEXT: movaps %xmm1, 16(%r8) 2272; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2273; SSE-NEXT: movaps %xmm1, 32(%r9) 2274; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2275; SSE-NEXT: movaps %xmm1, 48(%r9) 2276; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2277; SSE-NEXT: movaps %xmm1, (%r9) 2278; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2279; SSE-NEXT: movaps %xmm1, 16(%r9) 2280; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 2281; SSE-NEXT: movaps %xmm4, 32(%rax) 2282; SSE-NEXT: movaps %xmm10, 48(%rax) 2283; SSE-NEXT: movaps %xmm3, (%rax) 2284; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2285; SSE-NEXT: movaps %xmm1, 16(%rax) 2286; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 2287; SSE-NEXT: movaps %xmm0, 48(%rax) 2288; SSE-NEXT: movaps %xmm9, 32(%rax) 2289; SSE-NEXT: movaps %xmm11, 16(%rax) 2290; SSE-NEXT: movaps %xmm2, (%rax) 2291; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 2292; SSE-NEXT: movaps %xmm6, 48(%rax) 2293; SSE-NEXT: movaps %xmm7, 32(%rax) 2294; SSE-NEXT: movaps %xmm12, 16(%rax) 2295; SSE-NEXT: movaps %xmm8, (%rax) 2296; SSE-NEXT: addq $296, %rsp # imm = 0x128 2297; SSE-NEXT: retq 2298; 2299; AVX-LABEL: load_i32_stride8_vf16: 2300; AVX: # %bb.0: 2301; AVX-NEXT: subq $584, %rsp # imm = 0x248 2302; AVX-NEXT: vmovaps 32(%rdi), %xmm0 2303; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2304; AVX-NEXT: vmovaps (%rdi), %xmm12 2305; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] 2306; AVX-NEXT: vmovaps 96(%rdi), %xmm1 2307; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2308; AVX-NEXT: vmovaps 64(%rdi), %xmm2 2309; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2310; AVX-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2311; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm9[0] 2312; AVX-NEXT: vmovaps 160(%rdi), %xmm8 2313; AVX-NEXT: vmovaps 128(%rdi), %xmm10 2314; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] 2315; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 2316; AVX-NEXT: vmovaps 224(%rdi), %xmm0 2317; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2318; AVX-NEXT: vmovaps 192(%rdi), %xmm1 2319; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2320; AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2321; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm5[0,1,0,1] 2322; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 2323; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] 2324; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm6[4,5,6,7] 2325; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2326; AVX-NEXT: vmovaps 416(%rdi), %xmm11 2327; AVX-NEXT: vmovaps 384(%rdi), %xmm13 2328; AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] 2329; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 2330; AVX-NEXT: vmovaps 480(%rdi), %xmm0 2331; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 
2332; AVX-NEXT: vmovaps 448(%rdi), %xmm1 2333; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2334; AVX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2335; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm3[0,1,0,1] 2336; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 2337; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm6[6,7] 2338; AVX-NEXT: vmovaps 288(%rdi), %xmm14 2339; AVX-NEXT: vmovaps 256(%rdi), %xmm15 2340; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 2341; AVX-NEXT: vmovaps 352(%rdi), %xmm7 2342; AVX-NEXT: vmovaps 320(%rdi), %xmm6 2343; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 2344; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2345; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 2346; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2347; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,1,1] 2348; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2349; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] 2350; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3] 2351; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 2352; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm10[1,1,1,1] 2353; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3] 2354; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 2355; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] 2356; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 2357; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2358; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm15[1,1,1,1] 2359; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] 2360; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2361; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 2362; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm13[1,1,1,1] 2363; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] 2364; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 2365; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 2366; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2367; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2368; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] 2369; AVX-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] 2370; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2371; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,2,2,2] 2372; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 2373; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] 2374; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 2375; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 2376; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] 2377; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 2378; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] 2379; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 2380; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] 2381; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] 2382; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 2383; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2384; AVX-NEXT: vunpckhps {{.*#+}} xmm2 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] 2385; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] 2386; AVX-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload 2387; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,2,2,2] 2388; AVX-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 2389; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] 2390; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 2391; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm8 2392; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] 2393; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm7[2,2,2,2] 2394; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] 2395; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1],xmm8[2,3] 2396; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] 2397; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2398; AVX-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] 2399; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] 2400; AVX-NEXT: vunpckhps {{.*#+}} xmm5 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] 2401; AVX-NEXT: vmovaps 320(%rdi), %ymm8 2402; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 2403; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,3,2,3] 2404; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2405; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] 2406; AVX-NEXT: vmovaps 352(%rdi), %ymm5 2407; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2408; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 2409; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2410; AVX-NEXT: vmovaps 416(%rdi), %ymm4 2411; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2412; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] 2413; AVX-NEXT: vmovaps 384(%rdi), %ymm6 2414; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2415; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] 2416; AVX-NEXT: vmovaps 448(%rdi), %ymm7 2417; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2418; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] 2419; AVX-NEXT: vmovaps 480(%rdi), %ymm9 2420; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 2421; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3] 2422; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 2423; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 2424; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2425; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2426; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] 2427; AVX-NEXT: vmovaps %ymm9, %ymm3 2428; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5] 2429; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 2430; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm8[0],ymm5[2],ymm8[2] 2431; AVX-NEXT: vmovaps %ymm8, %ymm6 2432; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 2433; AVX-NEXT: vmovaps 288(%rdi), %ymm7 2434; AVX-NEXT: vmovaps 256(%rdi), %ymm9 2435; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] 2436; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 2437; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] 2438; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 2439; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 2440; AVX-NEXT: vmovaps 160(%rdi), %ymm11 2441; AVX-NEXT: vmovaps 128(%rdi), %ymm12 2442; AVX-NEXT: vmovaps 192(%rdi), %ymm10 2443; AVX-NEXT: vmovaps 224(%rdi), %ymm13 2444; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] 2445; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2446; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = 
ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[4],ymm11[4],ymm12[5],ymm11[5] 2447; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2448; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2449; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 2450; AVX-NEXT: vmovaps 64(%rdi), %ymm1 2451; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2452; AVX-NEXT: vmovaps 96(%rdi), %ymm0 2453; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2454; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 2455; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 2456; AVX-NEXT: vmovaps (%rdi), %ymm1 2457; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2458; AVX-NEXT: vmovaps 32(%rdi), %ymm15 2459; AVX-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] 2460; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 2461; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,0] 2462; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] 2463; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2464; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2465; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] 2466; AVX-NEXT: vmovaps %ymm3, %ymm8 2467; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 2468; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2469; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,0],ymm3[1,0],ymm0[5,4],ymm3[5,4] 2470; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,0],ymm2[2,3],ymm4[6,4],ymm2[6,7] 2471; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2472; AVX-NEXT: vunpcklps {{.*#+}} ymm4 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] 2473; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 2474; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm7[1,0],ymm9[1,0],ymm7[5,4],ymm9[5,4] 2475; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 2476; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm14[2,0],xmm4[2,3] 2477; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 2478; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2479; AVX-NEXT: vunpcklps {{.*#+}} ymm5 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] 2480; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm12[1,0],ymm11[5,4],ymm12[5,4] 2481; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,0],ymm5[2,3],ymm4[6,4],ymm5[6,7] 2482; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 2483; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 2484; AVX-NEXT: vunpcklps {{.*#+}} ymm4 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] 2485; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 2486; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 2487; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,0],ymm11[1,0],ymm15[5,4],ymm11[5,4] 2488; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 2489; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm14[2,0],xmm4[2,3] 2490; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 2491; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2492; AVX-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm1[1],ymm8[3],ymm1[3] 2493; AVX-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] 2494; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm5[2,0],ymm4[4,5],ymm5[6,4] 2495; AVX-NEXT: vmovaps %ymm6, %ymm3 2496; AVX-NEXT: vunpckhpd 
{{.*#+}} ymm4 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] 2497; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 2498; AVX-NEXT: vmovaps %ymm7, %ymm2 2499; AVX-NEXT: vunpckhps {{.*#+}} ymm14 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7] 2500; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 2501; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,0] 2502; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm1[4,5,6,7] 2503; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 2504; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] 2505; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2506; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 2507; AVX-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[6],ymm1[6],ymm6[7],ymm1[7] 2508; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,0],ymm4[4,5],ymm0[6,4] 2509; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] 2510; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 2511; AVX-NEXT: vunpckhps {{.*#+}} ymm14 = ymm11[2],ymm15[2],ymm11[3],ymm15[3],ymm11[6],ymm15[6],ymm11[7],ymm15[7] 2512; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 2513; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,0] 2514; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm4[0,1,2,3],ymm0[4,5,6,7] 2515; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2516; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] 2517; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2518; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 2519; AVX-NEXT: # ymm4 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4] 2520; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm0[2,3],ymm4[6,4],ymm0[6,7] 2521; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload 2522; AVX-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] 2523; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm2[3,0],ymm9[3,0],ymm2[7,4],ymm9[7,4] 2524; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 2525; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9 2526; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,0],xmm4[2,3] 2527; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] 2528; AVX-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm13[2],ymm7[3],ymm13[3],ymm7[6],ymm13[6],ymm7[7],ymm13[7] 2529; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,0],ymm6[3,0],ymm1[7,4],ymm6[7,4] 2530; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[2,3],ymm6[6,4],ymm4[6,7] 2531; AVX-NEXT: vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] 2532; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,0],ymm11[3,0],ymm15[7,4],ymm11[7,4] 2533; AVX-NEXT: vextractf128 $1, %ymm3, %xmm2 2534; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 2535; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] 2536; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] 2537; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2538; AVX-NEXT: vmovaps %ymm2, 32(%rsi) 2539; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2540; AVX-NEXT: vmovaps %ymm2, (%rsi) 2541; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2542; AVX-NEXT: vmovaps %ymm2, 32(%rdx) 2543; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2544; AVX-NEXT: vmovaps %ymm2, (%rdx) 2545; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2546; AVX-NEXT: vmovaps %ymm2, 32(%rcx) 2547; AVX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2548; AVX-NEXT: vmovaps %ymm2, (%rcx) 2549; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2550; AVX-NEXT: vmovaps %ymm2, 32(%r8) 2551; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2552; AVX-NEXT: vmovaps %ymm2, (%r8) 2553; AVX-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload 2554; AVX-NEXT: vmovaps %ymm2, 32(%r9) 2555; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2556; AVX-NEXT: vmovaps %ymm2, (%r9) 2557; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 2558; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2559; AVX-NEXT: vmovaps %ymm2, 32(%rax) 2560; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2561; AVX-NEXT: vmovaps %ymm2, (%rax) 2562; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 2563; AVX-NEXT: vmovaps %ymm5, 32(%rax) 2564; AVX-NEXT: vmovaps %ymm14, (%rax) 2565; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 2566; AVX-NEXT: vmovaps %ymm0, 32(%rax) 2567; AVX-NEXT: vmovaps %ymm1, (%rax) 2568; AVX-NEXT: addq $584, %rsp # imm = 0x248 2569; AVX-NEXT: vzeroupper 2570; AVX-NEXT: retq 2571; 2572; AVX2-LABEL: load_i32_stride8_vf16: 2573; AVX2: # %bb.0: 2574; AVX2-NEXT: subq $456, %rsp # imm = 0x1C8 2575; AVX2-NEXT: vmovaps 288(%rdi), %xmm8 2576; AVX2-NEXT: vmovaps 256(%rdi), %xmm9 2577; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] 2578; AVX2-NEXT: vmovaps 352(%rdi), %xmm1 2579; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2580; AVX2-NEXT: vbroadcastss %xmm1, %xmm2 2581; AVX2-NEXT: vmovaps 320(%rdi), %xmm1 2582; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2583; AVX2-NEXT: vbroadcastss %xmm1, %xmm3 2584; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2585; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 2586; AVX2-NEXT: vmovaps 416(%rdi), %xmm1 2587; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2588; AVX2-NEXT: vmovaps 384(%rdi), %xmm2 2589; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2590; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2591; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 2592; AVX2-NEXT: vmovaps 480(%rdi), %xmm1 2593; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2594; AVX2-NEXT: vbroadcastss %xmm1, %xmm3 2595; AVX2-NEXT: vmovaps 448(%rdi), %xmm1 2596; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2597; AVX2-NEXT: vbroadcastss %xmm1, %xmm10 2598; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] 2599; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 2600; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 2601; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 2602; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2603; AVX2-NEXT: vmovaps 160(%rdi), %xmm6 2604; AVX2-NEXT: vmovaps 128(%rdi), %xmm15 2605; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] 2606; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2607; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 2608; AVX2-NEXT: vmovaps 224(%rdi), %xmm4 2609; AVX2-NEXT: vbroadcastss %xmm4, %xmm3 2610; AVX2-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill 2611; AVX2-NEXT: vmovaps 192(%rdi), %xmm12 2612; AVX2-NEXT: vbroadcastss %xmm12, %xmm11 2613; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] 2614; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 2615; AVX2-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1,2,3,4,5],ymm3[6,7] 2616; AVX2-NEXT: vmovaps 96(%rdi), %xmm5 2617; AVX2-NEXT: vbroadcastss %xmm5, %xmm2 2618; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2619; AVX2-NEXT: vmovaps 64(%rdi), %xmm13 2620; AVX2-NEXT: vbroadcastss %xmm13, %xmm3 2621; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2622; AVX2-NEXT: vmovaps (%rdi), %xmm11 2623; AVX2-NEXT: vmovaps 32(%rdi), %xmm10 2624; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] 2625; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] 2626; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2627; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2628; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] 2629; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] 2630; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] 2631; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2632; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] 2633; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 2634; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1] 2635; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3] 2636; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 2637; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] 2638; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2639; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2640; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] 2641; AVX2-NEXT: vmovaps %xmm8, %xmm6 2642; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] 2643; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 2644; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 2645; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] 2646; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2647; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2648; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2649; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2650; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 2651; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2652; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1] 2653; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2654; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] 2655; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 2656; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] 2657; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2658; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2659; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] 2660; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] 2661; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] 2662; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] 2663; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 2664; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 2665; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] 2666; AVX2-NEXT: vmovaps %xmm8, %xmm7 2667; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2] 2668; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] 2669; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] 2670; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 2671; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill 2672; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload 2673; AVX2-NEXT: # xmm4 = xmm15[2],mem[2],xmm15[3],mem[3] 2674; AVX2-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload 2675; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] 2676; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3] 2677; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 2678; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 2679; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] 2680; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] 2681; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 2682; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] 2683; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] 2684; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] 2685; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] 2686; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2687; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] 2688; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] 2689; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2690; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 2691; AVX2-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] 2692; AVX2-NEXT: vmovaps 96(%rdi), %ymm5 2693; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2694; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 2695; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] 2696; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 2697; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] 2698; AVX2-NEXT: vmovaps 64(%rdi), %ymm6 2699; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2700; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2701; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2702; AVX2-NEXT: vmovaps 32(%rdi), %ymm15 2703; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] 2704; AVX2-NEXT: vmovaps (%rdi), %ymm14 2705; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2706; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] 2707; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 2708; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 2709; AVX2-NEXT: vmovaps 224(%rdi), %ymm3 2710; AVX2-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill 2711; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] 2712; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 2713; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 2714; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2715; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] 2716; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 2717; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] 2718; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2] 2719; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2720; AVX2-NEXT: vmovaps 192(%rdi), %ymm1 2721; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2722; AVX2-NEXT: vmovaps 160(%rdi), %ymm7 2723; AVX2-NEXT: vmovaps 128(%rdi), %ymm13 2724; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5] 2725; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] 2726; AVX2-NEXT: vunpcklpd {{.*#+}} 
ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] 2727; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 2728; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2729; AVX2-NEXT: vmovaps 288(%rdi), %ymm9 2730; AVX2-NEXT: vmovaps 256(%rdi), %ymm8 2731; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] 2732; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 2733; AVX2-NEXT: vmovaps 352(%rdi), %ymm1 2734; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2735; AVX2-NEXT: vmovaps 320(%rdi), %ymm2 2736; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2737; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 2738; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2] 2739; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3] 2740; AVX2-NEXT: vmovaps 480(%rdi), %ymm0 2741; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2742; AVX2-NEXT: vmovaps 448(%rdi), %ymm12 2743; AVX2-NEXT: vmovaps 416(%rdi), %ymm6 2744; AVX2-NEXT: vmovaps 384(%rdi), %ymm10 2745; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] 2746; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] 2747; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] 2748; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 2749; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2750; AVX2-NEXT: vbroadcastss 148(%rdi), %ymm0 2751; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] 2752; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] 2753; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm2 2754; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5] 2755; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] 2756; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 2757; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] 2758; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 2759; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2760; AVX2-NEXT: vbroadcastss 404(%rdi), %ymm0 2761; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] 2762; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] 2763; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm1 2764; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5] 2765; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] 2766; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 2767; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2768; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] 2769; AVX2-NEXT: vbroadcastss 248(%rdi), %ymm0 2770; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2771; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 2772; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] 2773; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] 2774; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2775; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 2776; AVX2-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 2777; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm1 2778; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2] 2779; 
AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] 2780; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] 2781; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 2782; AVX2-NEXT: vbroadcastss 504(%rdi), %ymm0 2783; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] 2784; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] 2785; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] 2786; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 2787; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload 2788; AVX2-NEXT: # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] 2789; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm8 2790; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2] 2791; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] 2792; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] 2793; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] 2794; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm8 2795; AVX2-NEXT: vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload 2796; AVX2-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] 2797; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] 2798; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 2799; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] 2800; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 2801; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] 2802; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] 2803; AVX2-NEXT: vbroadcastss 476(%rdi), %ymm4 2804; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 2805; AVX2-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] 2806; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] 2807; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm4 2808; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] 2809; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 2810; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] 2811; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 2812; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2813; AVX2-NEXT: vmovaps %ymm4, 32(%rsi) 2814; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2815; AVX2-NEXT: vmovaps %ymm4, (%rsi) 2816; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2817; AVX2-NEXT: vmovaps %ymm4, 32(%rdx) 2818; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2819; AVX2-NEXT: vmovaps %ymm4, (%rdx) 2820; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2821; AVX2-NEXT: vmovaps %ymm4, 32(%rcx) 2822; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2823; AVX2-NEXT: vmovaps %ymm4, (%rcx) 2824; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2825; AVX2-NEXT: vmovaps %ymm4, 32(%r8) 2826; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2827; AVX2-NEXT: vmovaps %ymm4, (%r8) 2828; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2829; AVX2-NEXT: vmovaps %ymm4, 32(%r9) 2830; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2831; AVX2-NEXT: vmovaps %ymm4, (%r9) 2832; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 2833; AVX2-NEXT: vmovaps %ymm11, 32(%rax) 2834; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2835; 
AVX2-NEXT: vmovaps %ymm4, (%rax) 2836; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 2837; AVX2-NEXT: vmovaps %ymm0, 32(%rax) 2838; AVX2-NEXT: vmovaps %ymm1, (%rax) 2839; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 2840; AVX2-NEXT: vmovaps %ymm3, 32(%rax) 2841; AVX2-NEXT: vmovaps %ymm2, (%rax) 2842; AVX2-NEXT: addq $456, %rsp # imm = 0x1C8 2843; AVX2-NEXT: vzeroupper 2844; AVX2-NEXT: retq 2845; 2846; AVX2-FP-LABEL: load_i32_stride8_vf16: 2847; AVX2-FP: # %bb.0: 2848; AVX2-FP-NEXT: subq $456, %rsp # imm = 0x1C8 2849; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm8 2850; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm9 2851; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] 2852; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm1 2853; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2854; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm2 2855; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm1 2856; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2857; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm3 2858; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2859; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 2860; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm1 2861; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2862; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm2 2863; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2864; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2865; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 2866; AVX2-FP-NEXT: vmovaps 480(%rdi), %xmm1 2867; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2868; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm3 2869; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm1 2870; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2871; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm10 2872; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] 2873; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 2874; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 2875; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 2876; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2877; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm6 2878; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm15 2879; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] 2880; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2881; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 2882; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm4 2883; AVX2-FP-NEXT: vbroadcastss %xmm4, %xmm3 2884; AVX2-FP-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill 2885; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm12 2886; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm11 2887; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] 2888; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 2889; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] 2890; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm5 2891; AVX2-FP-NEXT: vbroadcastss %xmm5, %xmm2 2892; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2893; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm13 2894; AVX2-FP-NEXT: vbroadcastss %xmm13, %xmm3 2895; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2896; AVX2-FP-NEXT: vmovaps (%rdi), %xmm11 2897; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm10 2898; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] 2899; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] 2900; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2901; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2902; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] 2903; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] 2904; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] 2905; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2906; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] 2907; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 2908; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1] 2909; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3] 2910; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 2911; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] 2912; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2913; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2914; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] 2915; AVX2-FP-NEXT: vmovaps %xmm8, %xmm6 2916; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] 2917; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 2918; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 2919; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] 2920; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2921; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2922; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2923; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2924; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 2925; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2926; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1] 2927; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2928; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] 2929; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 2930; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] 2931; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2932; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2933; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] 2934; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] 2935; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] 2936; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] 2937; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 2938; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 2939; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] 2940; AVX2-FP-NEXT: vmovaps %xmm8, %xmm7 2941; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2] 2942; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] 2943; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] 2944; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 2945; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2946; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload 2947; AVX2-FP-NEXT: # xmm4 = xmm15[2],mem[2],xmm15[3],mem[3] 2948; AVX2-FP-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload 2949; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] 2950; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3] 2951; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 2952; AVX2-FP-NEXT: vinsertf128 $1, 
%xmm4, %ymm0, %ymm6 2953; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] 2954; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] 2955; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 2956; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] 2957; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] 2958; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] 2959; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] 2960; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2961; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] 2962; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] 2963; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2964; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 2965; AVX2-FP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] 2966; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm5 2967; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2968; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 2969; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] 2970; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 2971; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] 2972; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm6 2973; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2974; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2975; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2976; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm15 2977; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] 2978; AVX2-FP-NEXT: vmovaps (%rdi), %ymm14 2979; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2980; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] 2981; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 2982; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 2983; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm3 2984; AVX2-FP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill 2985; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] 2986; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 2987; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 2988; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2989; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] 2990; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 2991; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] 2992; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2] 2993; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2994; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm1 2995; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2996; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm7 2997; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm13 2998; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5] 2999; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] 3000; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] 3001; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 3002; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3003; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm9 3004; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm8 
3005; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] 3006; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 3007; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm1 3008; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3009; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm2 3010; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3011; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 3012; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2] 3013; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3] 3014; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm0 3015; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3016; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm12 3017; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm6 3018; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm10 3019; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] 3020; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] 3021; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] 3022; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 3023; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3024; AVX2-FP-NEXT: vbroadcastss 148(%rdi), %ymm0 3025; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] 3026; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] 3027; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm2 3028; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5] 3029; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] 3030; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 3031; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] 3032; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 3033; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3034; AVX2-FP-NEXT: vbroadcastss 404(%rdi), %ymm0 3035; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] 3036; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] 3037; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm1 3038; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5] 3039; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] 3040; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 3041; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 3042; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3043; AVX2-FP-NEXT: vbroadcastss 248(%rdi), %ymm0 3044; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3045; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 3046; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] 3047; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] 3048; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 3049; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 3050; AVX2-FP-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 3051; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm1 3052; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2] 3053; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] 3054; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm2[0,1,2,3,4,5],ymm0[6,7] 3055; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3056; AVX2-FP-NEXT: vbroadcastss 504(%rdi), %ymm0 3057; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] 3058; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] 3059; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] 3060; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3061; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload 3062; AVX2-FP-NEXT: # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] 3063; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm8 3064; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2] 3065; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] 3066; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] 3067; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] 3068; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm8 3069; AVX2-FP-NEXT: vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload 3070; AVX2-FP-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] 3071; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] 3072; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 3073; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] 3074; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 3075; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] 3076; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] 3077; AVX2-FP-NEXT: vbroadcastss 476(%rdi), %ymm4 3078; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 3079; AVX2-FP-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] 3080; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] 3081; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm4 3082; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] 3083; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 3084; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] 3085; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 3086; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3087; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi) 3088; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3089; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) 3090; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3091; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx) 3092; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3093; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx) 3094; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3095; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx) 3096; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3097; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx) 3098; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3099; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r8) 3100; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3101; AVX2-FP-NEXT: vmovaps %ymm4, (%r8) 3102; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3103; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r9) 3104; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3105; AVX2-FP-NEXT: vmovaps %ymm4, (%r9) 3106; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3107; AVX2-FP-NEXT: vmovaps %ymm11, 32(%rax) 3108; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3109; AVX2-FP-NEXT: vmovaps %ymm4, (%rax) 3110; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3111; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) 3112; AVX2-FP-NEXT: vmovaps %ymm1, (%rax) 3113; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3114; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rax) 3115; AVX2-FP-NEXT: vmovaps %ymm2, (%rax) 3116; AVX2-FP-NEXT: addq $456, %rsp # imm = 0x1C8 3117; AVX2-FP-NEXT: vzeroupper 3118; AVX2-FP-NEXT: retq 3119; 3120; AVX2-FCP-LABEL: load_i32_stride8_vf16: 3121; AVX2-FCP: # %bb.0: 3122; AVX2-FCP-NEXT: subq $456, %rsp # imm = 0x1C8 3123; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm8 3124; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm9 3125; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] 3126; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm1 3127; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3128; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm2 3129; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm1 3130; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3131; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm3 3132; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 3133; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 3134; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm1 3135; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3136; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm2 3137; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3138; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 3139; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 3140; AVX2-FCP-NEXT: vmovaps 480(%rdi), %xmm1 3141; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3142; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm3 3143; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm1 3144; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3145; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm10 3146; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] 3147; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 3148; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 3149; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 3150; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3151; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm6 3152; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm15 3153; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] 3154; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3155; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 3156; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm4 3157; AVX2-FCP-NEXT: vbroadcastss %xmm4, %xmm3 3158; AVX2-FCP-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill 3159; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm12 3160; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm11 3161; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] 3162; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 3163; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] 3164; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm5 3165; AVX2-FCP-NEXT: vbroadcastss %xmm5, %xmm2 3166; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3167; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm13 3168; AVX2-FCP-NEXT: vbroadcastss %xmm13, %xmm3 3169; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 3170; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm11 3171; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm10 3172; AVX2-FCP-NEXT: 
vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] 3173; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] 3174; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3175; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3176; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] 3177; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] 3178; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] 3179; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 3180; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] 3181; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 3182; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1] 3183; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3] 3184; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 3185; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] 3186; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3187; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3188; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] 3189; AVX2-FCP-NEXT: vmovaps %xmm8, %xmm6 3190; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] 3191; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3192; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 3193; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] 3194; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 3195; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 3196; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3197; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 3198; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 3199; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 3200; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1] 3201; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3202; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] 3203; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 3204; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] 3205; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3206; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3207; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] 3208; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] 3209; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] 3210; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] 3211; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 3212; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 3213; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] 3214; AVX2-FCP-NEXT: vmovaps %xmm8, %xmm7 3215; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2] 3216; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] 3217; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] 3218; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 3219; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3220; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload 3221; AVX2-FCP-NEXT: # xmm4 = xmm15[2],mem[2],xmm15[3],mem[3] 3222; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload 3223; AVX2-FCP-NEXT: vshufps 
{{.*#+}} xmm5 = xmm9[2,2,2,2] 3224; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3] 3225; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 3226; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 3227; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] 3228; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] 3229; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 3230; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] 3231; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] 3232; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] 3233; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] 3234; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3235; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] 3236; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] 3237; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3238; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 3239; AVX2-FCP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] 3240; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm5 3241; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3242; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 3243; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] 3244; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 3245; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] 3246; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm6 3247; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3248; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3249; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3250; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm15 3251; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] 3252; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm14 3253; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 3254; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] 3255; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 3256; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 3257; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm3 3258; AVX2-FCP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill 3259; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] 3260; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 3261; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3262; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3263; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] 3264; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 3265; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] 3266; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2] 3267; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 3268; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1 3269; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3270; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm7 3271; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm13 3272; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5] 3273; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] 3274; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] 
3275; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 3276; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3277; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm9 3278; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm8 3279; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] 3280; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 3281; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm1 3282; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3283; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm2 3284; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3285; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 3286; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2] 3287; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3] 3288; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm0 3289; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3290; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm12 3291; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm6 3292; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm10 3293; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] 3294; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] 3295; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] 3296; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 3297; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3298; AVX2-FCP-NEXT: vbroadcastss 148(%rdi), %ymm0 3299; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] 3300; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] 3301; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm2 3302; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5] 3303; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] 3304; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 3305; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] 3306; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 3307; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3308; AVX2-FCP-NEXT: vbroadcastss 404(%rdi), %ymm0 3309; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] 3310; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] 3311; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm1 3312; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5] 3313; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] 3314; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 3315; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 3316; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3317; AVX2-FCP-NEXT: vbroadcastss 248(%rdi), %ymm0 3318; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3319; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 3320; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] 3321; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] 3322; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 3323; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 3324; AVX2-FCP-NEXT: # ymm5 = 
ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 3325; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm1 3326; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2] 3327; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] 3328; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] 3329; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3330; AVX2-FCP-NEXT: vbroadcastss 504(%rdi), %ymm0 3331; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] 3332; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] 3333; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] 3334; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3335; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload 3336; AVX2-FCP-NEXT: # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] 3337; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm8 3338; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2] 3339; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] 3340; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] 3341; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] 3342; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm8 3343; AVX2-FCP-NEXT: vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload 3344; AVX2-FCP-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] 3345; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] 3346; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 3347; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] 3348; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4 3349; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] 3350; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] 3351; AVX2-FCP-NEXT: vbroadcastss 476(%rdi), %ymm4 3352; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 3353; AVX2-FCP-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] 3354; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] 3355; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm4 3356; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] 3357; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 3358; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] 3359; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 3360; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3361; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi) 3362; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3363; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) 3364; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3365; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx) 3366; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3367; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx) 3368; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3369; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx) 3370; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3371; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx) 3372; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3373; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r8) 3374; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3375; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8) 3376; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3377; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r9) 3378; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3379; AVX2-FCP-NEXT: vmovaps %ymm4, (%r9) 3380; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3381; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rax) 3382; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3383; AVX2-FCP-NEXT: vmovaps %ymm4, (%rax) 3384; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3385; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) 3386; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax) 3387; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3388; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rax) 3389; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax) 3390; AVX2-FCP-NEXT: addq $456, %rsp # imm = 0x1C8 3391; AVX2-FCP-NEXT: vzeroupper 3392; AVX2-FCP-NEXT: retq 3393; 3394; AVX512-LABEL: load_i32_stride8_vf16: 3395; AVX512: # %bb.0: 3396; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 3397; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 3398; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 3399; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 3400; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 3401; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 3402; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm4 3403; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm5 3404; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm3 3405; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7 3406; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6 3407; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 3408; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3409; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 3410; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 3411; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 3412; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 3413; AVX512-NEXT: movb $-64, %dil 3414; AVX512-NEXT: kmovw %edi, %k1 3415; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} 3416; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 3417; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 3418; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 3419; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] 3420; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 3421; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 3422; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3423; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 3424; AVX512-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 3425; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 3426; AVX512-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 3427; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} 3428; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 3429; AVX512-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 3430; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 3431; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 3432; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 3433; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 3434; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3435; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 3436; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 3437; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 3438; AVX512-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 3439; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} 3440; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 3441; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 3442; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 3443; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 3444; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 3445; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 
3446; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3447; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 3448; AVX512-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 3449; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 3450; AVX512-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 3451; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} 3452; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 3453; AVX512-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 3454; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 3455; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] 3456; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 3457; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 3458; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3459; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 3460; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 3461; AVX512-NEXT: vmovdqa64 %zmm3, %zmm14 3462; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 3463; AVX512-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} 3464; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 3465; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 3466; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 3467; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] 3468; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 3469; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 3470; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3471; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 3472; AVX512-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 3473; AVX512-NEXT: vmovdqa64 %zmm3, %zmm15 3474; AVX512-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 3475; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} 3476; AVX512-NEXT: vmovdqa64 %zmm1, %zmm14 3477; AVX512-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 3478; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 3479; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] 3480; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 3481; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 3482; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3483; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15 3484; AVX512-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 3485; AVX512-NEXT: vmovdqa64 %zmm3, %zmm16 3486; AVX512-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 3487; AVX512-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} 3488; AVX512-NEXT: vmovdqa64 %zmm1, %zmm15 3489; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 3490; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 3491; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] 3492; AVX512-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 3493; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 3494; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3495; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 3496; AVX512-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 3497; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 3498; AVX512-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 3499; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 3500; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3501; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 3502; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) 3503; AVX512-NEXT: vmovdqa64 %zmm9, (%rdx) 3504; AVX512-NEXT: vmovdqa64 %zmm10, (%rcx) 3505; AVX512-NEXT: vmovdqa64 %zmm11, (%r8) 3506; AVX512-NEXT: vmovdqa64 %zmm12, (%r9) 3507; AVX512-NEXT: vmovdqa64 %zmm13, (%r11) 3508; AVX512-NEXT: vmovdqa64 %zmm14, (%r10) 3509; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) 3510; AVX512-NEXT: vzeroupper 3511; AVX512-NEXT: retq 3512; 3513; 
AVX512-FCP-LABEL: load_i32_stride8_vf16: 3514; AVX512-FCP: # %bb.0: 3515; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3516; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 3517; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 3518; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 3519; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 3520; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 3521; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 3522; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 3523; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 3524; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 3525; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 3526; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 3527; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3528; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 3529; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 3530; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 3531; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 3532; AVX512-FCP-NEXT: movb $-64, %dil 3533; AVX512-FCP-NEXT: kmovw %edi, %k1 3534; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} 3535; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 3536; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 3537; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 3538; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] 3539; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 3540; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 3541; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3542; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 3543; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 3544; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 3545; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 3546; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} 3547; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 3548; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 3549; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 3550; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 3551; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 3552; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 3553; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3554; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 3555; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 3556; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 3557; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 3558; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} 3559; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 3560; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 3561; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 3562; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 3563; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 3564; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 3565; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3566; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 3567; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 3568; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 3569; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 3570; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} 3571; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 3572; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 3573; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 3574; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] 3575; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, 
%zmm11 3576; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 3577; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3578; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 3579; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 3580; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 3581; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 3582; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} 3583; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 3584; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 3585; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 3586; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] 3587; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 3588; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 3589; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3590; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 3591; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 3592; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 3593; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 3594; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} 3595; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 3596; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 3597; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 3598; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] 3599; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 3600; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 3601; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3602; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 3603; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 3604; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 3605; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 3606; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} 3607; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 3608; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 3609; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 3610; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] 3611; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 3612; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 3613; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3614; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 3615; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 3616; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 3617; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 3618; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 3619; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3620; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 3621; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) 3622; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) 3623; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) 3624; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%r8) 3625; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%r9) 3626; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%r11) 3627; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%r10) 3628; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 3629; AVX512-FCP-NEXT: vzeroupper 3630; AVX512-FCP-NEXT: retq 3631; 3632; AVX512DQ-LABEL: load_i32_stride8_vf16: 3633; AVX512DQ: # %bb.0: 3634; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 3635; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 3636; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 3637; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 3638; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 3639; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1 
3640; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm4 3641; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm5 3642; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm3 3643; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm7 3644; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm6 3645; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 3646; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3647; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9 3648; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 3649; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 3650; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 3651; AVX512DQ-NEXT: movb $-64, %dil 3652; AVX512DQ-NEXT: kmovw %edi, %k1 3653; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} 3654; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 3655; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 3656; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 3657; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] 3658; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 3659; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 3660; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3661; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 3662; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 3663; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm11 3664; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 3665; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} 3666; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 3667; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 3668; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 3669; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 3670; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 3671; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 3672; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3673; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 3674; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 3675; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 3676; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 3677; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} 3678; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 3679; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 3680; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 3681; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 3682; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 3683; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 3684; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3685; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 3686; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 3687; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 3688; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 3689; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} 3690; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 3691; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 3692; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 3693; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] 3694; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 3695; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 3696; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3697; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 3698; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 3699; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm14 3700; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 3701; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} 3702; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 3703; 
AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 3704; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 3705; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] 3706; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 3707; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 3708; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3709; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 3710; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 3711; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm15 3712; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 3713; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} 3714; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14 3715; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 3716; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 3717; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] 3718; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 3719; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 3720; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3721; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm15 3722; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 3723; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm16 3724; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 3725; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} 3726; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm15 3727; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 3728; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 3729; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] 3730; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 3731; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 3732; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3733; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 3734; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 3735; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 3736; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 3737; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 3738; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3739; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 3740; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi) 3741; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx) 3742; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rcx) 3743; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r8) 3744; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%r9) 3745; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%r11) 3746; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%r10) 3747; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) 3748; AVX512DQ-NEXT: vzeroupper 3749; AVX512DQ-NEXT: retq 3750; 3751; AVX512DQ-FCP-LABEL: load_i32_stride8_vf16: 3752; AVX512DQ-FCP: # %bb.0: 3753; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3754; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 3755; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 3756; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 3757; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 3758; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 3759; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 3760; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 3761; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 3762; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 3763; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 3764; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 3765; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3766; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 3767; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 3768; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 3769; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 3770; AVX512DQ-FCP-NEXT: movb $-64, %dil 3771; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 3772; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} 3773; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 3774; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 3775; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 3776; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] 3777; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 3778; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 3779; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3780; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 3781; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 3782; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 3783; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 3784; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} 3785; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 3786; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 3787; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 3788; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 3789; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 3790; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 3791; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3792; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 3793; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 3794; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 3795; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 3796; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} 3797; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 3798; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 3799; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 3800; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 3801; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 3802; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 3803; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3804; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 3805; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 3806; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 3807; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 3808; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} 3809; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 3810; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 3811; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 3812; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] 3813; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 3814; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 3815; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3816; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 3817; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 3818; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 3819; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 3820; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} 3821; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 3822; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 3823; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 3824; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] 3825; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 3826; AVX512DQ-FCP-NEXT: vbroadcasti32x4 
{{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 3827; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3828; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 3829; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 3830; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 3831; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 3832; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} 3833; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 3834; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 3835; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 3836; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] 3837; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 3838; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 3839; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3840; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 3841; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 3842; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 3843; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 3844; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} 3845; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 3846; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 3847; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 3848; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] 3849; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 3850; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 3851; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3852; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 3853; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 3854; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 3855; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 3856; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 3857; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3858; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 3859; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) 3860; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) 3861; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) 3862; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%r8) 3863; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%r9) 3864; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%r11) 3865; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%r10) 3866; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 3867; AVX512DQ-FCP-NEXT: vzeroupper 3868; AVX512DQ-FCP-NEXT: retq 3869; 3870; AVX512BW-LABEL: load_i32_stride8_vf16: 3871; AVX512BW: # %bb.0: 3872; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 3873; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 3874; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 3875; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3876; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 3877; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 3878; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 3879; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 3880; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 3881; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 3882; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 3883; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 3884; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3885; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 3886; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 3887; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 3888; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 3889; AVX512BW-NEXT: movb $-64, %dil 3890; AVX512BW-NEXT: kmovd %edi, %k1 3891; 
AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} 3892; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 3893; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 3894; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 3895; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] 3896; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 3897; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 3898; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3899; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 3900; AVX512BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 3901; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 3902; AVX512BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 3903; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} 3904; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 3905; AVX512BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 3906; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 3907; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 3908; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 3909; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 3910; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3911; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 3912; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 3913; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 3914; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 3915; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} 3916; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 3917; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 3918; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 3919; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 3920; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 3921; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 3922; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3923; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 3924; AVX512BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 3925; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 3926; AVX512BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 3927; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} 3928; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 3929; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 3930; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 3931; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] 3932; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 3933; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 3934; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3935; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 3936; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 3937; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 3938; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 3939; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} 3940; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 3941; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 3942; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 3943; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] 3944; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 3945; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 3946; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3947; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 3948; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 3949; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 3950; AVX512BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 3951; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} 3952; AVX512BW-NEXT: vmovdqa64 
%zmm1, %zmm14 3953; AVX512BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 3954; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 3955; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] 3956; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 3957; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 3958; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3959; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 3960; AVX512BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 3961; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 3962; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 3963; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} 3964; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 3965; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 3966; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 3967; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] 3968; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 3969; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 3970; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3971; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 3972; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 3973; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 3974; AVX512BW-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 3975; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 3976; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3977; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 3978; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) 3979; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) 3980; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) 3981; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8) 3982; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r9) 3983; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r11) 3984; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r10) 3985; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) 3986; AVX512BW-NEXT: vzeroupper 3987; AVX512BW-NEXT: retq 3988; 3989; AVX512BW-FCP-LABEL: load_i32_stride8_vf16: 3990; AVX512BW-FCP: # %bb.0: 3991; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3992; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 3993; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 3994; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 3995; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 3996; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 3997; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 3998; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 3999; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 4000; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 4001; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 4002; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 4003; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4004; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 4005; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 4006; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 4007; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 4008; AVX512BW-FCP-NEXT: movb $-64, %dil 4009; AVX512BW-FCP-NEXT: kmovd %edi, %k1 4010; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} 4011; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 4012; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 4013; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 4014; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] 4015; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 4016; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 4017; AVX512BW-FCP-NEXT: # zmm9 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4018; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 4019; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 4020; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 4021; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 4022; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} 4023; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 4024; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 4025; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 4026; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 4027; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 4028; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 4029; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4030; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 4031; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 4032; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 4033; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 4034; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} 4035; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 4036; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 4037; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 4038; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 4039; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 4040; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 4041; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4042; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 4043; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 4044; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 4045; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 4046; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} 4047; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 4048; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 4049; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 4050; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] 4051; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 4052; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 4053; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4054; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 4055; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 4056; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 4057; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 4058; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} 4059; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 4060; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 4061; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 4062; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] 4063; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 4064; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 4065; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4066; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 4067; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 4068; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 4069; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 4070; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} 4071; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 4072; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 4073; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 4074; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] 4075; AVX512BW-FCP-NEXT: 
vinserti64x4 $0, %ymm13, %zmm15, %zmm13 4076; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 4077; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4078; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 4079; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 4080; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 4081; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 4082; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} 4083; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 4084; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 4085; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 4086; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] 4087; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 4088; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 4089; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4090; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 4091; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 4092; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 4093; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 4094; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 4095; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4096; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 4097; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) 4098; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) 4099; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) 4100; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8) 4101; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9) 4102; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r11) 4103; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10) 4104; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 4105; AVX512BW-FCP-NEXT: vzeroupper 4106; AVX512BW-FCP-NEXT: retq 4107; 4108; AVX512DQ-BW-LABEL: load_i32_stride8_vf16: 4109; AVX512DQ-BW: # %bb.0: 4110; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 4111; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 4112; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 4113; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 4114; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 4115; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 4116; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4 4117; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm5 4118; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3 4119; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7 4120; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6 4121; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 4122; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4123; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 4124; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 4125; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 4126; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 4127; AVX512DQ-BW-NEXT: movb $-64, %dil 4128; AVX512DQ-BW-NEXT: kmovd %edi, %k1 4129; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} 4130; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 4131; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 4132; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 4133; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] 4134; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 4135; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 4136; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4137; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 4138; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 4139; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 4140; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 4141; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} 4142; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 4143; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 4144; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 4145; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 4146; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 4147; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 4148; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4149; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 4150; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 4151; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 4152; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 4153; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} 4154; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 4155; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 4156; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 4157; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 4158; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 4159; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 4160; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4161; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 4162; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 4163; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 4164; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 4165; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} 4166; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 4167; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 4168; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 4169; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] 4170; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 4171; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 4172; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4173; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 4174; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 4175; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14 4176; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 4177; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} 4178; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 4179; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 4180; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 4181; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] 4182; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 4183; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 4184; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4185; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 4186; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 4187; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15 4188; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 4189; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} 4190; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14 4191; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 4192; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 4193; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] 4194; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 4195; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 4196; AVX512DQ-BW-NEXT: # zmm14 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4197; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15 4198; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 4199; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm16 4200; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 4201; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} 4202; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15 4203; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 4204; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 4205; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] 4206; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 4207; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 4208; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4209; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 4210; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 4211; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 4212; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 4213; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 4214; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4215; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 4216; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi) 4217; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdx) 4218; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rcx) 4219; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r8) 4220; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r9) 4221; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r11) 4222; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%r10) 4223; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) 4224; AVX512DQ-BW-NEXT: vzeroupper 4225; AVX512DQ-BW-NEXT: retq 4226; 4227; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf16: 4228; AVX512DQ-BW-FCP: # %bb.0: 4229; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4230; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 4231; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 4232; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 4233; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 4234; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 4235; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 4236; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 4237; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 4238; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 4239; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 4240; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 4241; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4242; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 4243; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 4244; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 4245; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 4246; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil 4247; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 4248; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} 4249; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 4250; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 4251; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 4252; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] 4253; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 4254; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 4255; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4256; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 4257; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 4258; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 4259; AVX512DQ-BW-FCP-NEXT: vpermt2d 
%zmm5, %zmm9, %zmm11 4260; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} 4261; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 4262; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 4263; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 4264; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 4265; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 4266; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 4267; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4268; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 4269; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 4270; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 4271; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 4272; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} 4273; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 4274; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 4275; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 4276; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 4277; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 4278; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 4279; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4280; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 4281; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 4282; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 4283; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 4284; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} 4285; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 4286; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 4287; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 4288; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] 4289; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 4290; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 4291; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4292; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 4293; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 4294; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 4295; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 4296; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} 4297; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 4298; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 4299; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 4300; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] 4301; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 4302; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 4303; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4304; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 4305; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 4306; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 4307; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 4308; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} 4309; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 4310; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 4311; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 4312; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] 4313; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 4314; 
AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 4315; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4316; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 4317; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 4318; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 4319; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 4320; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} 4321; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 4322; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 4323; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 4324; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] 4325; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 4326; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 4327; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 4328; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 4329; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 4330; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 4331; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 4332; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 4333; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4334; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 4335; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) 4336; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) 4337; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) 4338; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8) 4339; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9) 4340; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r11) 4341; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10) 4342; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 4343; AVX512DQ-BW-FCP-NEXT: vzeroupper 4344; AVX512DQ-BW-FCP-NEXT: retq 4345 %wide.vec = load <128 x i32>, ptr %in.vec, align 64 4346 %strided.vec0 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120> 4347 %strided.vec1 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121> 4348 %strided.vec2 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122> 4349 %strided.vec3 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123> 4350 %strided.vec4 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124> 4351 %strided.vec5 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125> 4352 %strided.vec6 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126> 4353 %strided.vec7 = shufflevector <128 x i32> %wide.vec, <128 x i32> 
poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127> 4354 store <16 x i32> %strided.vec0, ptr %out.vec0, align 64 4355 store <16 x i32> %strided.vec1, ptr %out.vec1, align 64 4356 store <16 x i32> %strided.vec2, ptr %out.vec2, align 64 4357 store <16 x i32> %strided.vec3, ptr %out.vec3, align 64 4358 store <16 x i32> %strided.vec4, ptr %out.vec4, align 64 4359 store <16 x i32> %strided.vec5, ptr %out.vec5, align 64 4360 store <16 x i32> %strided.vec6, ptr %out.vec6, align 64 4361 store <16 x i32> %strided.vec7, ptr %out.vec7, align 64 4362 ret void 4363} 4364 4365define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { 4366; SSE-LABEL: load_i32_stride8_vf32: 4367; SSE: # %bb.0: 4368; SSE-NEXT: subq $952, %rsp # imm = 0x3B8 4369; SSE-NEXT: movaps 544(%rdi), %xmm5 4370; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4371; SSE-NEXT: movaps 608(%rdi), %xmm6 4372; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4373; SSE-NEXT: movaps 576(%rdi), %xmm7 4374; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4375; SSE-NEXT: movaps 672(%rdi), %xmm8 4376; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4377; SSE-NEXT: movaps 640(%rdi), %xmm4 4378; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4379; SSE-NEXT: movaps 736(%rdi), %xmm9 4380; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4381; SSE-NEXT: movaps 704(%rdi), %xmm3 4382; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4383; SSE-NEXT: movaps 160(%rdi), %xmm10 4384; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4385; SSE-NEXT: movaps 128(%rdi), %xmm1 4386; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4387; SSE-NEXT: movaps 224(%rdi), %xmm2 4388; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4389; SSE-NEXT: movaps 192(%rdi), %xmm0 4390; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4391; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 4392; SSE-NEXT: movaps %xmm1, %xmm2 4393; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] 4394; SSE-NEXT: movaps %xmm2, %xmm1 4395; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 4396; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4397; SSE-NEXT: movaps %xmm3, %xmm1 4398; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] 4399; SSE-NEXT: movaps %xmm4, %xmm3 4400; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] 4401; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 4402; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4403; SSE-NEXT: movaps %xmm3, %xmm0 4404; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4405; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4406; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] 4407; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4408; SSE-NEXT: movaps %xmm7, %xmm0 4409; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 4410; SSE-NEXT: movaps 512(%rdi), %xmm1 4411; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4412; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] 4413; SSE-NEXT: movaps %xmm1, %xmm2 4414; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 4415; 
SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4416; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 4417; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4418; SSE-NEXT: movaps 480(%rdi), %xmm1 4419; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4420; SSE-NEXT: movaps 448(%rdi), %xmm10 4421; SSE-NEXT: movaps %xmm10, %xmm0 4422; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4423; SSE-NEXT: movaps 416(%rdi), %xmm3 4424; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4425; SSE-NEXT: movaps 384(%rdi), %xmm2 4426; SSE-NEXT: movaps %xmm2, %xmm1 4427; SSE-NEXT: movaps %xmm2, %xmm14 4428; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 4429; SSE-NEXT: movaps %xmm1, %xmm2 4430; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 4431; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4432; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 4433; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4434; SSE-NEXT: movaps 992(%rdi), %xmm1 4435; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4436; SSE-NEXT: movaps 960(%rdi), %xmm15 4437; SSE-NEXT: movaps %xmm15, %xmm0 4438; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4439; SSE-NEXT: movaps 928(%rdi), %xmm2 4440; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4441; SSE-NEXT: movaps 896(%rdi), %xmm1 4442; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4443; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 4444; SSE-NEXT: movaps %xmm1, %xmm2 4445; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 4446; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4447; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 4448; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4449; SSE-NEXT: movaps 352(%rdi), %xmm1 4450; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4451; SSE-NEXT: movaps 320(%rdi), %xmm12 4452; SSE-NEXT: movaps %xmm12, %xmm0 4453; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4454; SSE-NEXT: movaps 288(%rdi), %xmm3 4455; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4456; SSE-NEXT: movaps 256(%rdi), %xmm1 4457; SSE-NEXT: movaps %xmm1, %xmm2 4458; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 4459; SSE-NEXT: movaps %xmm2, %xmm3 4460; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] 4461; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4462; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 4463; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4464; SSE-NEXT: movaps 864(%rdi), %xmm2 4465; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4466; SSE-NEXT: movaps 832(%rdi), %xmm11 4467; SSE-NEXT: movaps %xmm11, %xmm0 4468; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 4469; SSE-NEXT: movaps 800(%rdi), %xmm4 4470; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill 4471; SSE-NEXT: movaps 768(%rdi), %xmm2 4472; SSE-NEXT: movaps %xmm2, %xmm3 4473; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 4474; SSE-NEXT: movaps %xmm3, %xmm4 4475; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] 4476; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4477; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 4478; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4479; SSE-NEXT: movaps 96(%rdi), %xmm6 4480; SSE-NEXT: 
movaps 64(%rdi), %xmm9 4481; SSE-NEXT: movaps %xmm9, %xmm13 4482; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] 4483; SSE-NEXT: movaps (%rdi), %xmm8 4484; SSE-NEXT: movaps 32(%rdi), %xmm3 4485; SSE-NEXT: movaps %xmm8, %xmm7 4486; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] 4487; SSE-NEXT: movaps %xmm7, %xmm5 4488; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0] 4489; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4490; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] 4491; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4492; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4493; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 4494; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] 4495; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4496; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 4497; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] 4498; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 4499; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 4500; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] 4501; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4502; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 4503; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] 4504; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4505; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4506; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 4507; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] 4508; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4509; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4510; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 4511; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 4512; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] 4513; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 4514; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] 4515; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4516; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 4517; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] 4518; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 4519; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 4520; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] 4521; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 4522; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] 4523; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4524; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] 4525; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 4526; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] 4527; SSE-NEXT: unpckhps (%rsp), %xmm2 # 16-byte Folded Reload 4528; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 4529; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] 4530; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] 4531; SSE-NEXT: movaps %xmm5, %xmm3 4532; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] 4533; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4534; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1] 4535; 
SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4536; SSE-NEXT: movaps %xmm1, %xmm3 4537; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] 4538; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4539; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] 4540; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4541; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4542; SSE-NEXT: movaps %xmm6, %xmm1 4543; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] 4544; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4545; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] 4546; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4547; SSE-NEXT: movaps %xmm0, %xmm3 4548; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] 4549; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4550; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] 4551; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4552; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4553; SSE-NEXT: movaps %xmm1, %xmm3 4554; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0] 4555; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4556; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] 4557; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4558; SSE-NEXT: movaps %xmm2, %xmm1 4559; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] 4560; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4561; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] 4562; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4563; SSE-NEXT: movaps %xmm14, %xmm1 4564; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] 4565; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4566; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1] 4567; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4568; SSE-NEXT: movaps %xmm8, %xmm1 4569; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] 4570; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4571; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] 4572; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4573; SSE-NEXT: movaps 240(%rdi), %xmm2 4574; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4575; SSE-NEXT: movaps 208(%rdi), %xmm12 4576; SSE-NEXT: movaps %xmm12, %xmm0 4577; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 4578; SSE-NEXT: movaps 176(%rdi), %xmm13 4579; SSE-NEXT: movaps 144(%rdi), %xmm2 4580; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4581; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] 4582; SSE-NEXT: movaps %xmm2, %xmm1 4583; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 4584; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4585; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 4586; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4587; SSE-NEXT: movaps 368(%rdi), %xmm4 4588; SSE-NEXT: movaps 336(%rdi), %xmm1 4589; SSE-NEXT: movaps %xmm1, %xmm0 4590; SSE-NEXT: movaps %xmm1, %xmm9 4591; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 4592; SSE-NEXT: movaps 304(%rdi), %xmm5 4593; SSE-NEXT: movaps 272(%rdi), %xmm8 4594; SSE-NEXT: movaps %xmm8, %xmm1 4595; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] 4596; SSE-NEXT: movaps %xmm1, %xmm2 4597; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 4598; 
SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4599; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 4600; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill 4601; SSE-NEXT: movaps 496(%rdi), %xmm7 4602; SSE-NEXT: movaps 464(%rdi), %xmm0 4603; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4604; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] 4605; SSE-NEXT: movaps 432(%rdi), %xmm2 4606; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4607; SSE-NEXT: movaps 400(%rdi), %xmm1 4608; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4609; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 4610; SSE-NEXT: movaps %xmm1, %xmm2 4611; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 4612; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4613; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 4614; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4615; SSE-NEXT: movaps 624(%rdi), %xmm2 4616; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4617; SSE-NEXT: movaps 592(%rdi), %xmm1 4618; SSE-NEXT: movaps %xmm1, %xmm0 4619; SSE-NEXT: movaps %xmm1, %xmm3 4620; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 4621; SSE-NEXT: movaps 560(%rdi), %xmm6 4622; SSE-NEXT: movaps 528(%rdi), %xmm1 4623; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4624; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] 4625; SSE-NEXT: movaps %xmm1, %xmm2 4626; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 4627; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4628; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 4629; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4630; SSE-NEXT: movaps 752(%rdi), %xmm1 4631; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4632; SSE-NEXT: movaps 720(%rdi), %xmm0 4633; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4634; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4635; SSE-NEXT: movaps 688(%rdi), %xmm2 4636; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4637; SSE-NEXT: movaps 656(%rdi), %xmm1 4638; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4639; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 4640; SSE-NEXT: movaps %xmm1, %xmm2 4641; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 4642; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4643; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 4644; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4645; SSE-NEXT: movaps 880(%rdi), %xmm1 4646; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4647; SSE-NEXT: movaps 848(%rdi), %xmm14 4648; SSE-NEXT: movaps %xmm14, %xmm0 4649; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4650; SSE-NEXT: movaps 816(%rdi), %xmm1 4651; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4652; SSE-NEXT: movaps 784(%rdi), %xmm2 4653; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4654; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4655; SSE-NEXT: movaps %xmm2, %xmm1 4656; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 4657; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4658; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 4659; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4660; SSE-NEXT: movaps 
1008(%rdi), %xmm1 4661; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4662; SSE-NEXT: movaps 976(%rdi), %xmm11 4663; SSE-NEXT: movaps %xmm11, %xmm0 4664; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4665; SSE-NEXT: movaps 944(%rdi), %xmm1 4666; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4667; SSE-NEXT: movaps 912(%rdi), %xmm2 4668; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4669; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4670; SSE-NEXT: movaps %xmm2, %xmm1 4671; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 4672; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4673; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 4674; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4675; SSE-NEXT: movaps 112(%rdi), %xmm1 4676; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4677; SSE-NEXT: movaps 80(%rdi), %xmm10 4678; SSE-NEXT: movaps %xmm10, %xmm0 4679; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4680; SSE-NEXT: movaps 16(%rdi), %xmm15 4681; SSE-NEXT: movaps 48(%rdi), %xmm1 4682; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4683; SSE-NEXT: movaps %xmm15, %xmm2 4684; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4685; SSE-NEXT: movaps %xmm2, %xmm1 4686; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 4687; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4688; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 4689; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4690; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 4691; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] 4692; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4693; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] 4694; SSE-NEXT: movaps %xmm9, %xmm13 4695; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3] 4696; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] 4697; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 4698; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] 4699; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4700; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 4701; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] 4702; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 4703; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] 4704; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4705; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4706; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] 4707; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4708; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 4709; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] 4710; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4711; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 4712; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] 4713; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 4714; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] 4715; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4716; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 4717; SSE-NEXT: # xmm3 = 
xmm3[2],mem[2],xmm3[3],mem[3] 4718; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 4719; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] 4720; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4721; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 4722; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 4723; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 4724; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] 4725; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 4726; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] 4727; SSE-NEXT: movaps %xmm1, %xmm0 4728; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] 4729; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4730; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] 4731; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4732; SSE-NEXT: movaps %xmm8, %xmm1 4733; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] 4734; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4735; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] 4736; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4737; SSE-NEXT: movaps %xmm7, %xmm12 4738; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] 4739; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] 4740; SSE-NEXT: movaps %xmm5, %xmm13 4741; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4742; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm1[0] 4743; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] 4744; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4745; SSE-NEXT: movaps %xmm4, %xmm0 4746; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] 4747; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] 4748; SSE-NEXT: movaps %xmm0, %xmm5 4749; SSE-NEXT: movaps %xmm3, %xmm8 4750; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0] 4751; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] 4752; SSE-NEXT: movaps %xmm3, %xmm6 4753; SSE-NEXT: movaps %xmm2, %xmm3 4754; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0] 4755; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] 4756; SSE-NEXT: movaps %xmm15, %xmm0 4757; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] 4758; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm10[1] 4759; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4760; SSE-NEXT: movaps %xmm1, 96(%rsi) 4761; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4762; SSE-NEXT: movaps %xmm1, 32(%rsi) 4763; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4764; SSE-NEXT: movaps %xmm1, 112(%rsi) 4765; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4766; SSE-NEXT: movaps %xmm1, 48(%rsi) 4767; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4768; SSE-NEXT: movaps %xmm1, 64(%rsi) 4769; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4770; SSE-NEXT: movaps %xmm1, (%rsi) 4771; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4772; SSE-NEXT: movaps %xmm1, 80(%rsi) 4773; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4774; SSE-NEXT: movaps %xmm1, 16(%rsi) 4775; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4776; SSE-NEXT: movaps %xmm1, 96(%rdx) 4777; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4778; SSE-NEXT: movaps %xmm1, 32(%rdx) 4779; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4780; 
SSE-NEXT: movaps %xmm1, 112(%rdx) 4781; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4782; SSE-NEXT: movaps %xmm1, 48(%rdx) 4783; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4784; SSE-NEXT: movaps %xmm1, 64(%rdx) 4785; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4786; SSE-NEXT: movaps %xmm1, (%rdx) 4787; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4788; SSE-NEXT: movaps %xmm1, 80(%rdx) 4789; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4790; SSE-NEXT: movaps %xmm1, 16(%rdx) 4791; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4792; SSE-NEXT: movaps %xmm1, 96(%rcx) 4793; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4794; SSE-NEXT: movaps %xmm1, 32(%rcx) 4795; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4796; SSE-NEXT: movaps %xmm1, 112(%rcx) 4797; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4798; SSE-NEXT: movaps %xmm1, 48(%rcx) 4799; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4800; SSE-NEXT: movaps %xmm1, 64(%rcx) 4801; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4802; SSE-NEXT: movaps %xmm1, (%rcx) 4803; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4804; SSE-NEXT: movaps %xmm1, 80(%rcx) 4805; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4806; SSE-NEXT: movaps %xmm1, 16(%rcx) 4807; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4808; SSE-NEXT: movaps %xmm1, 112(%r8) 4809; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4810; SSE-NEXT: movaps %xmm1, 96(%r8) 4811; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4812; SSE-NEXT: movaps %xmm1, 80(%r8) 4813; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4814; SSE-NEXT: movaps %xmm1, 64(%r8) 4815; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4816; SSE-NEXT: movaps %xmm1, 48(%r8) 4817; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4818; SSE-NEXT: movaps %xmm1, 32(%r8) 4819; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4820; SSE-NEXT: movaps %xmm1, 16(%r8) 4821; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4822; SSE-NEXT: movaps %xmm1, (%r8) 4823; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4824; SSE-NEXT: movaps %xmm1, 112(%r9) 4825; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4826; SSE-NEXT: movaps %xmm1, 96(%r9) 4827; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4828; SSE-NEXT: movaps %xmm1, 80(%r9) 4829; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4830; SSE-NEXT: movaps %xmm1, 64(%r9) 4831; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4832; SSE-NEXT: movaps %xmm1, 48(%r9) 4833; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4834; SSE-NEXT: movaps %xmm1, 32(%r9) 4835; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4836; SSE-NEXT: movaps %xmm1, 16(%r9) 4837; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4838; SSE-NEXT: movaps %xmm1, (%r9) 4839; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 4840; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4841; SSE-NEXT: movaps %xmm1, 112(%rax) 4842; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4843; SSE-NEXT: movaps %xmm1, 96(%rax) 4844; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 
# 16-byte Reload 4845; SSE-NEXT: movaps %xmm1, 80(%rax) 4846; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4847; SSE-NEXT: movaps %xmm1, 64(%rax) 4848; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4849; SSE-NEXT: movaps %xmm1, 48(%rax) 4850; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload 4851; SSE-NEXT: movaps %xmm1, 32(%rax) 4852; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4853; SSE-NEXT: movaps %xmm1, 16(%rax) 4854; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4855; SSE-NEXT: movaps %xmm1, (%rax) 4856; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 4857; SSE-NEXT: movaps %xmm3, 112(%rax) 4858; SSE-NEXT: movaps %xmm8, 96(%rax) 4859; SSE-NEXT: movaps %xmm4, 80(%rax) 4860; SSE-NEXT: movaps %xmm13, 64(%rax) 4861; SSE-NEXT: movaps %xmm12, 48(%rax) 4862; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4863; SSE-NEXT: movaps %xmm1, 32(%rax) 4864; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4865; SSE-NEXT: movaps %xmm1, 16(%rax) 4866; SSE-NEXT: movaps %xmm0, (%rax) 4867; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 4868; SSE-NEXT: movaps %xmm2, 112(%rax) 4869; SSE-NEXT: movaps %xmm6, 96(%rax) 4870; SSE-NEXT: movaps %xmm5, 80(%rax) 4871; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4872; SSE-NEXT: movaps %xmm0, 64(%rax) 4873; SSE-NEXT: movaps %xmm7, 48(%rax) 4874; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4875; SSE-NEXT: movaps %xmm0, 32(%rax) 4876; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4877; SSE-NEXT: movaps %xmm0, 16(%rax) 4878; SSE-NEXT: movaps %xmm15, (%rax) 4879; SSE-NEXT: addq $952, %rsp # imm = 0x3B8 4880; SSE-NEXT: retq 4881; 4882; AVX-LABEL: load_i32_stride8_vf32: 4883; AVX: # %bb.0: 4884; AVX-NEXT: subq $1768, %rsp # imm = 0x6E8 4885; AVX-NEXT: vmovaps 288(%rdi), %xmm14 4886; AVX-NEXT: vmovaps 256(%rdi), %xmm10 4887; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] 4888; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4889; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4890; AVX-NEXT: vmovaps 352(%rdi), %xmm1 4891; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4892; AVX-NEXT: vmovaps 320(%rdi), %xmm2 4893; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4894; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4895; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4896; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4897; AVX-NEXT: vmovaps 416(%rdi), %xmm1 4898; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill 4899; AVX-NEXT: vmovaps 384(%rdi), %xmm2 4900; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4901; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4902; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 4903; AVX-NEXT: vmovaps 480(%rdi), %xmm2 4904; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4905; AVX-NEXT: vmovaps 448(%rdi), %xmm3 4906; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4907; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 4908; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4909; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] 4910; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 4911; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] 4912; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4913; AVX-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4914; AVX-NEXT: vmovaps 928(%rdi), %xmm0 4915; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4916; AVX-NEXT: vmovaps 896(%rdi), %xmm1 4917; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4918; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4919; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 4920; AVX-NEXT: vmovaps 992(%rdi), %xmm1 4921; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4922; AVX-NEXT: vmovaps 960(%rdi), %xmm2 4923; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4924; AVX-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4925; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm9[0,1,0,1] 4926; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 4927; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 4928; AVX-NEXT: vmovaps 800(%rdi), %xmm1 4929; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4930; AVX-NEXT: vmovaps 768(%rdi), %xmm11 4931; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] 4932; AVX-NEXT: vmovaps 864(%rdi), %xmm2 4933; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4934; AVX-NEXT: vmovaps 832(%rdi), %xmm3 4935; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4936; AVX-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 4937; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] 4938; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 4939; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4940; AVX-NEXT: vmovaps 160(%rdi), %xmm1 4941; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4942; AVX-NEXT: vmovaps 128(%rdi), %xmm0 4943; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4944; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4945; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 4946; AVX-NEXT: vmovaps 224(%rdi), %xmm1 4947; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4948; AVX-NEXT: vmovaps 192(%rdi), %xmm2 4949; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4950; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4951; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4952; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] 4953; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 4954; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 4955; AVX-NEXT: vmovaps 32(%rdi), %xmm1 4956; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4957; AVX-NEXT: vmovaps (%rdi), %xmm13 4958; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] 4959; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4960; AVX-NEXT: vmovaps 96(%rdi), %xmm2 4961; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4962; AVX-NEXT: vmovaps 64(%rdi), %xmm3 4963; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4964; AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 4965; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] 4966; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 4967; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4968; AVX-NEXT: vmovaps 672(%rdi), %xmm12 4969; AVX-NEXT: vmovaps 640(%rdi), %xmm0 4970; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4971; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] 4972; AVX-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 4973; AVX-NEXT: vmovaps 736(%rdi), %xmm1 4974; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4975; AVX-NEXT: vmovaps 704(%rdi), %xmm2 4976; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4977; AVX-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4978; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[0,1,0,1] 4979; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 4980; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm4[6,7] 4981; AVX-NEXT: vmovaps 544(%rdi), %xmm6 4982; AVX-NEXT: vmovaps 512(%rdi), %xmm3 4983; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] 4984; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4985; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4986; AVX-NEXT: vmovaps 608(%rdi), %xmm4 4987; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4988; AVX-NEXT: vmovaps 576(%rdi), %xmm2 4989; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4990; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 4991; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] 4992; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4993; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4994; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] 4995; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] 4996; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4997; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3] 4998; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 4999; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5000; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] 5001; AVX-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload 5002; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3] 5003; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 5004; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] 5005; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5006; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5007; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] 5008; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 5009; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] 5010; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] 5011; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 5012; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 5013; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm9[1,1,1,1] 5014; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5015; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2,3] 5016; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 5017; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] 5018; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5019; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5020; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1] 5021; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 5022; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 5023; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] 5024; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 5025; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5026; AVX-NEXT: # xmm5 = mem[1,1,1,1] 5027; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm13 # 16-byte Reload 5028; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] 5029; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 5030; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] 5031; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5032; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5033; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1] 5034; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] 5035; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 5036; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 5037; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5038; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,1,1] 5039; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm12[1],xmm2[2,3] 5040; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5041; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 5042; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5043; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5044; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5045; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload 5046; AVX-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] 5047; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5048; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm14[2],xmm4[3],xmm14[3] 5049; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5050; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 5051; AVX-NEXT: # xmm0 = mem[2,2,2,2] 5052; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 5053; AVX-NEXT: # xmm0 = mem[0,1,2],xmm0[3] 5054; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 5055; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 5056; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] 5057; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 5058; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2] 5059; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5060; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] 5061; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] 5062; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 5063; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5064; AVX-NEXT: vunpckhps {{.*#+}} xmm4 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] 5065; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5066; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] 5067; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 5068; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5069; AVX-NEXT: # xmm2 = mem[2,2,2,2] 5070; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 5071; AVX-NEXT: # xmm2 = mem[0,1,2],xmm2[3] 5072; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5073; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 5074; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] 5075; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 5076; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm14[2,2,2,2] 5077; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5078; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3] 5079; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] 5080; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 5081; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill 5082; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5083; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload 5084; AVX-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] 5085; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5086; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5087; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] 5088; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5089; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5090; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] 5091; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 5092; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] 5093; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5094; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 5095; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] 5096; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 5097; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm9[2,2,2,2] 5098; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5099; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] 5100; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3] 5101; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] 5102; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5103; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5104; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload 5105; AVX-NEXT: # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3] 5106; AVX-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] 5107; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 5108; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] 5109; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5110; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] 5111; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 5112; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm8 5113; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] 5114; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5115; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm4[2,2,2,2] 5116; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5117; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] 5118; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] 5119; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] 5120; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5121; AVX-NEXT: vunpckhps {{.*#+}} xmm6 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] 5122; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5123; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm1[1],xmm6[1] 5124; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5125; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload 5126; AVX-NEXT: # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3] 5127; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 5128; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5129; AVX-NEXT: # xmm1 = mem[2,3,2,3] 5130; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5131; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] 5132; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] 5133; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5134; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] 
5135; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5136; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1] 5137; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5138; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload 5139; AVX-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] 5140; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 5141; AVX-NEXT: vpermilps $238, (%rsp), %xmm0 # 16-byte Folded Reload 5142; AVX-NEXT: # xmm0 = mem[2,3,2,3] 5143; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 5144; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] 5145; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5146; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5147; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] 5148; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5149; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 5150; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] 5151; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5152; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 5153; AVX-NEXT: # xmm3 = mem[2,3,2,3] 5154; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 5155; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] 5156; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5157; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5158; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] 5159; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm0[1] 5160; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] 5161; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5162; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] 5163; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5164; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 5165; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5166; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5167; AVX-NEXT: vmovaps 416(%rdi), %ymm12 5168; AVX-NEXT: vmovaps 384(%rdi), %ymm9 5169; AVX-NEXT: vmovaps 448(%rdi), %ymm7 5170; AVX-NEXT: vmovaps 480(%rdi), %ymm11 5171; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] 5172; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5173; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[4],ymm12[4],ymm9[5],ymm12[5] 5174; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5175; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 5176; AVX-NEXT: vmovaps 320(%rdi), %ymm2 5177; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5178; AVX-NEXT: vmovaps 352(%rdi), %ymm1 5179; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5180; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] 5181; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 5182; AVX-NEXT: vmovaps 288(%rdi), %ymm2 5183; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5184; AVX-NEXT: vmovaps 256(%rdi), %ymm10 5185; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[1],ymm2[1],ymm10[4],ymm2[4],ymm10[5],ymm2[5] 5186; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5187; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 5188; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] 5189; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5190; AVX-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5191; AVX-NEXT: vmovaps 928(%rdi), %ymm5 5192; AVX-NEXT: vmovaps 896(%rdi), %ymm3 5193; AVX-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill 5194; AVX-NEXT: vmovaps 960(%rdi), %ymm1 5195; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5196; AVX-NEXT: vmovaps 992(%rdi), %ymm0 5197; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5198; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 5199; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] 5200; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5201; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 5202; AVX-NEXT: vmovaps 832(%rdi), %ymm2 5203; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5204; AVX-NEXT: vmovaps 864(%rdi), %ymm1 5205; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5206; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] 5207; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 5208; AVX-NEXT: vmovaps 800(%rdi), %ymm3 5209; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5210; AVX-NEXT: vmovaps 768(%rdi), %ymm2 5211; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5212; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 5213; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 5214; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] 5215; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5216; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5217; AVX-NEXT: vmovaps 672(%rdi), %ymm2 5218; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5219; AVX-NEXT: vmovaps 640(%rdi), %ymm3 5220; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5221; AVX-NEXT: vmovaps 704(%rdi), %ymm13 5222; AVX-NEXT: vmovaps 736(%rdi), %ymm0 5223; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5224; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] 5225; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5226; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 5227; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 5228; AVX-NEXT: vmovaps 576(%rdi), %ymm1 5229; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5230; AVX-NEXT: vmovaps 608(%rdi), %ymm0 5231; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5232; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 5233; AVX-NEXT: vextractf128 $1, %ymm0, %xmm6 5234; AVX-NEXT: vmovaps 544(%rdi), %ymm0 5235; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5236; AVX-NEXT: vmovaps 512(%rdi), %ymm1 5237; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5238; AVX-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 5239; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 5240; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,0] 5241; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm4[4,5,6,7] 5242; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5243; AVX-NEXT: vmovaps 160(%rdi), %ymm0 5244; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5245; AVX-NEXT: vmovaps 128(%rdi), %ymm1 5246; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5247; AVX-NEXT: vmovaps 
192(%rdi), %ymm2 5248; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5249; AVX-NEXT: vmovaps 224(%rdi), %ymm3 5250; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5251; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] 5252; AVX-NEXT: vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 5253; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm4[2,0],ymm6[4,5],ymm4[6,4] 5254; AVX-NEXT: vmovaps 64(%rdi), %ymm0 5255; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5256; AVX-NEXT: vmovaps 96(%rdi), %ymm2 5257; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5258; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] 5259; AVX-NEXT: vextractf128 $1, %ymm4, %xmm0 5260; AVX-NEXT: vmovaps (%rdi), %ymm2 5261; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5262; AVX-NEXT: vmovaps 32(%rdi), %ymm14 5263; AVX-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm14[0],ymm2[1],ymm14[1],ymm2[4],ymm14[4],ymm2[5],ymm14[5] 5264; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 5265; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0] 5266; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5267; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5268; AVX-NEXT: vmovaps %ymm11, %ymm6 5269; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5270; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm11[0],ymm7[1],ymm11[1],ymm7[4],ymm11[4],ymm7[5],ymm11[5] 5271; AVX-NEXT: vmovaps %ymm12, %ymm7 5272; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5273; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm9[1,0],ymm12[5,4],ymm9[5,4] 5274; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] 5275; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 5276; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5277; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm9[0],ymm3[1],ymm9[1],ymm3[4],ymm9[4],ymm3[5],ymm9[5] 5278; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 5279; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 5280; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm11[1,0],ymm10[1,0],ymm11[5,4],ymm10[5,4] 5281; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 5282; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] 5283; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5284; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5285; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 5286; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5287; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm2[0],ymm10[0],ymm2[1],ymm10[1],ymm2[4],ymm10[4],ymm2[5],ymm10[5] 5288; AVX-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload 5289; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm12[1,0],ymm5[5,4],ymm12[5,4] 5290; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] 5291; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5292; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload 5293; AVX-NEXT: # ymm1 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5] 5294; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 5295; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 5296; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm15 # 32-byte Folded Reload 5297; AVX-NEXT: # ymm15 = ymm8[1,0],mem[1,0],ymm8[5,4],mem[5,4] 5298; AVX-NEXT: vextractf128 $1, 
%ymm15, %xmm15 5299; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] 5300; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5301; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5302; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 5303; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5] 5304; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 5305; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload 5306; AVX-NEXT: # ymm1 = ymm13[1,0],mem[1,0],ymm13[5,4],mem[5,4] 5307; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] 5308; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5309; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 5310; AVX-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] 5311; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 5312; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 5313; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload 5314; AVX-NEXT: # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] 5315; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 5316; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] 5317; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5318; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5319; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5320; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 5321; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 5322; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5323; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 5324; AVX-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] 5325; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] 5326; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5327; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 5328; AVX-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] 5329; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 5330; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload 5331; AVX-NEXT: # ymm15 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] 5332; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 5333; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] 5334; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5335; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5336; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload 5337; AVX-NEXT: # ymm0 = ymm6[1],mem[1],ymm6[3],mem[3] 5338; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5339; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] 5340; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 5341; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm3[1],ymm9[3],ymm3[3] 5342; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 5343; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 5344; AVX-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7] 5345; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 5346; AVX-NEXT: vshufps {{.*#+}} xmm1 = 
xmm15[0,1],xmm1[2,0] 5347; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5348; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5349; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] 5350; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload 5351; AVX-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] 5352; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 5353; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5354; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] 5355; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 5356; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5357; AVX-NEXT: vunpckhps {{.*#+}} ymm15 = ymm2[2],ymm8[2],ymm2[3],ymm8[3],ymm2[6],ymm8[6],ymm2[7],ymm8[7] 5358; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 5359; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,0] 5360; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5361; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5362; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5363; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] 5364; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 5365; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[6],ymm13[6],ymm4[7],ymm13[7] 5366; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 5367; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5368; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5369; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] 5370; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 5371; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 5372; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5373; AVX-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[6],ymm3[6],ymm7[7],ymm3[7] 5374; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 5375; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,0] 5376; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5377; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5378; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 5379; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5380; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] 5381; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 5382; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 5383; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] 5384; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 5385; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 5386; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5387; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] 5388; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 5389; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 5390; AVX-NEXT: vunpckhps {{.*#+}} ymm15 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] 5391; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 5392; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,0] 5393; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5394; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill 5395; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5396; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 5397; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 5398; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5399; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 5400; AVX-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] 5401; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] 5402; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5403; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 5404; AVX-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 5405; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 5406; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,0],ymm9[3,0],ymm15[7,4],ymm9[7,4] 5407; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 5408; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 5409; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] 5410; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5411; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload 5412; AVX-NEXT: # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] 5413; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5414; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm4[3,0],ymm1[7,4],ymm4[7,4] 5415; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] 5416; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7] 5417; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm7[3,0],ymm3[7,4],ymm7[7,4] 5418; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 5419; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 5420; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] 5421; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5422; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5423; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 5424; AVX-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 5425; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5426; AVX-NEXT: vshufps $51, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload 5427; AVX-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] 5428; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7] 5429; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5430; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 5431; AVX-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 5432; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5433; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 5434; AVX-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] 5435; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 5436; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 5437; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,3] 5438; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 5439; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload 5440; AVX-NEXT: # ymm2 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] 5441; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm10[3,0],ymm12[3,0],ymm10[7,4],ymm12[7,4] 5442; AVX-NEXT: vshufps {{.*#+}} ymm2 = 
ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] 5443; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload 5444; AVX-NEXT: # ymm3 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] 5445; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,0],ymm13[3,0],ymm14[7,4],ymm13[7,4] 5446; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 5447; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 5448; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,3] 5449; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 5450; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5451; AVX-NEXT: vmovaps %ymm3, 64(%rsi) 5452; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5453; AVX-NEXT: vmovaps %ymm3, (%rsi) 5454; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5455; AVX-NEXT: vmovaps %ymm3, 96(%rsi) 5456; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5457; AVX-NEXT: vmovaps %ymm3, 32(%rsi) 5458; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5459; AVX-NEXT: vmovaps %ymm3, 64(%rdx) 5460; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5461; AVX-NEXT: vmovaps %ymm3, (%rdx) 5462; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5463; AVX-NEXT: vmovaps %ymm3, 96(%rdx) 5464; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5465; AVX-NEXT: vmovaps %ymm3, 32(%rdx) 5466; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5467; AVX-NEXT: vmovaps %ymm3, 64(%rcx) 5468; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5469; AVX-NEXT: vmovaps %ymm3, (%rcx) 5470; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5471; AVX-NEXT: vmovaps %ymm3, 96(%rcx) 5472; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5473; AVX-NEXT: vmovaps %ymm3, 32(%rcx) 5474; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5475; AVX-NEXT: vmovaps %ymm3, 64(%r8) 5476; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5477; AVX-NEXT: vmovaps %ymm3, (%r8) 5478; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5479; AVX-NEXT: vmovaps %ymm3, 96(%r8) 5480; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5481; AVX-NEXT: vmovaps %ymm3, 32(%r8) 5482; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5483; AVX-NEXT: vmovaps %ymm3, 64(%r9) 5484; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5485; AVX-NEXT: vmovaps %ymm3, (%r9) 5486; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5487; AVX-NEXT: vmovaps %ymm3, 96(%r9) 5488; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5489; AVX-NEXT: vmovaps %ymm3, 32(%r9) 5490; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 5491; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5492; AVX-NEXT: vmovaps %ymm3, 64(%rax) 5493; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5494; AVX-NEXT: vmovaps %ymm3, (%rax) 5495; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5496; AVX-NEXT: vmovaps %ymm3, 96(%rax) 5497; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5498; AVX-NEXT: vmovaps %ymm3, 32(%rax) 5499; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 5500; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5501; AVX-NEXT: vmovaps %ymm3, 64(%rax) 5502; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5503; AVX-NEXT: vmovaps %ymm3, (%rax) 5504; AVX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5505; AVX-NEXT: vmovaps %ymm3, 96(%rax) 5506; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5507; AVX-NEXT: vmovaps %ymm3, 32(%rax) 5508; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 5509; AVX-NEXT: vmovaps %ymm1, 96(%rax) 5510; AVX-NEXT: vmovaps %ymm0, 64(%rax) 5511; AVX-NEXT: vmovaps %ymm9, 32(%rax) 5512; AVX-NEXT: vmovaps %ymm2, (%rax) 5513; AVX-NEXT: addq $1768, %rsp # imm = 0x6E8 5514; AVX-NEXT: vzeroupper 5515; AVX-NEXT: retq 5516; 5517; AVX2-LABEL: load_i32_stride8_vf32: 5518; AVX2: # %bb.0: 5519; AVX2-NEXT: subq $1544, %rsp # imm = 0x608 5520; AVX2-NEXT: vmovaps 288(%rdi), %xmm0 5521; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5522; AVX2-NEXT: vmovaps 256(%rdi), %xmm10 5523; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] 5524; AVX2-NEXT: vmovaps 352(%rdi), %xmm14 5525; AVX2-NEXT: vbroadcastss %xmm14, %xmm1 5526; AVX2-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5527; AVX2-NEXT: vmovaps 320(%rdi), %xmm15 5528; AVX2-NEXT: vbroadcastss %xmm15, %xmm2 5529; AVX2-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5530; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 5531; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 5532; AVX2-NEXT: vmovaps 416(%rdi), %xmm1 5533; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5534; AVX2-NEXT: vmovaps 384(%rdi), %xmm12 5535; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] 5536; AVX2-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5537; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5538; AVX2-NEXT: vmovaps 480(%rdi), %xmm2 5539; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5540; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 5541; AVX2-NEXT: vmovaps 448(%rdi), %xmm3 5542; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5543; AVX2-NEXT: vbroadcastss %xmm3, %xmm3 5544; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 5545; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5546; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] 5547; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5548; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5549; AVX2-NEXT: vmovaps 800(%rdi), %xmm1 5550; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5551; AVX2-NEXT: vmovaps 768(%rdi), %xmm0 5552; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5553; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 5554; AVX2-NEXT: vmovaps 864(%rdi), %xmm1 5555; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5556; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 5557; AVX2-NEXT: vmovaps 832(%rdi), %xmm2 5558; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5559; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 5560; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 5561; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 5562; AVX2-NEXT: vmovaps 992(%rdi), %xmm1 5563; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5564; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 5565; AVX2-NEXT: vmovaps 960(%rdi), %xmm11 5566; AVX2-NEXT: vbroadcastss %xmm11, %xmm2 5567; AVX2-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5568; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 5569; AVX2-NEXT: vmovaps 928(%rdi), %xmm2 5570; AVX2-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5571; AVX2-NEXT: vmovaps 896(%rdi), %xmm3 5572; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5573; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 5574; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5575; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5576; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 5577; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5578; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5579; AVX2-NEXT: vmovaps 608(%rdi), %xmm0 5580; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5581; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 5582; AVX2-NEXT: vmovaps 576(%rdi), %xmm1 5583; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5584; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 5585; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 5586; AVX2-NEXT: vmovaps 544(%rdi), %xmm1 5587; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill 5588; AVX2-NEXT: vmovaps 512(%rdi), %xmm2 5589; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5590; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 5591; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 5592; AVX2-NEXT: vmovaps 736(%rdi), %xmm1 5593; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5594; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 5595; AVX2-NEXT: vmovaps 704(%rdi), %xmm2 5596; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5597; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 5598; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 5599; AVX2-NEXT: vmovaps 672(%rdi), %xmm2 5600; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5601; AVX2-NEXT: vmovaps 640(%rdi), %xmm3 5602; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5603; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 5604; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5605; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5606; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 5607; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5608; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5609; AVX2-NEXT: vmovaps 224(%rdi), %xmm0 5610; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5611; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 5612; AVX2-NEXT: vmovaps 192(%rdi), %xmm1 5613; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5614; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 5615; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 5616; AVX2-NEXT: vmovaps 160(%rdi), %xmm9 5617; AVX2-NEXT: vmovaps 128(%rdi), %xmm8 5618; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] 5619; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5620; AVX2-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5621; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5622; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 5623; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7] 5624; AVX2-NEXT: vmovaps 96(%rdi), %xmm7 5625; AVX2-NEXT: vbroadcastss %xmm7, %xmm1 5626; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5627; AVX2-NEXT: vmovaps 64(%rdi), %xmm6 5628; AVX2-NEXT: vbroadcastss %xmm6, %xmm2 5629; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5630; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] 5631; AVX2-NEXT: vmovaps (%rdi), %xmm5 5632; AVX2-NEXT: vmovaps 32(%rdi), %xmm4 5633; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 5634; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5635; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5636; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] 5637; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7] 5638; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5639; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] 5640; AVX2-NEXT: vmovaps %xmm10, %xmm3 5641; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 5642; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3] 5643; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 5644; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 5645; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 5646; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5647; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 5648; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5649; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1] 5650; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 5651; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 5652; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5653; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 5654; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5655; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5656; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 5657; AVX2-NEXT: # xmm0 = mem[1,1,1,1] 5658; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 5659; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 5660; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 5661; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload 5662; AVX2-NEXT: # xmm1 = xmm12[0],mem[0],xmm12[1],mem[1] 5663; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 5664; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload 5665; AVX2-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] 5666; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5667; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 5668; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] 5669; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 5670; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] 5671; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5672; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 5673; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5674; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5675; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] 5676; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] 5677; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 5678; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 5679; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5680; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 5681; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 5682; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5683; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] 
5684; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] 5685; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5686; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 5687; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5688; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5689; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 5690; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] 5691; AVX2-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload 5692; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] 5693; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 5694; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5695; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] 5696; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 5697; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5698; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5699; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 5700; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5701; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5702; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1] 5703; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5704; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] 5705; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5706; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 5707; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5708; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5709; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] 5710; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5711; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5712; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 5713; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 5714; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5715; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] 5716; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] 5717; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 5718; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5719; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 5720; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 5721; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2] 5722; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5723; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] 5724; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] 5725; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5726; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5727; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5728; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload 5729; AVX2-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] 5730; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5731; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] 5732; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5733; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 5734; AVX2-NEXT: # xmm15 = mem[2,2,2,2] 5735; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5736; 
AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] 5737; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 5738; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 5739; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 5740; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5741; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm0[2,2,2,2] 5742; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3] 5743; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm3[0,1],xmm15[2,3] 5744; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] 5745; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5746; AVX2-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] 5747; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] 5748; AVX2-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2] 5749; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 5750; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] 5751; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 5752; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15 5753; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] 5754; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2] 5755; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload 5756; AVX2-NEXT: # xmm15 = mem[0,1,2],xmm15[3] 5757; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3] 5758; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] 5759; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5760; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5761; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload 5762; AVX2-NEXT: # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3] 5763; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5764; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm15[2,2,2,2] 5765; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5766; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] 5767; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 5768; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11 5769; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] 5770; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5771; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload 5772; AVX2-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3] 5773; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 5774; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2] 5775; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 5776; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] 5777; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 5778; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] 5779; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5780; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] 5781; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5782; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] 5783; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5784; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload 5785; AVX2-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] 5786; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 5787; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 5788; AVX2-NEXT: # xmm3 = mem[2,3,2,3] 
5789; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 5790; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] 5791; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 5792; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5793; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5794; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] 5795; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5796; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] 5797; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload 5798; AVX2-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] 5799; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5800; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5801; AVX2-NEXT: # xmm1 = mem[2,3,2,3] 5802; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5803; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] 5804; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5805; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5806; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5807; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 5808; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 5809; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] 5810; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload 5811; AVX2-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] 5812; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5813; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] 5814; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5815; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 5816; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5817; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5818; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm15[2],xmm4[3],xmm15[3] 5819; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 5820; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] 5821; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5822; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 5823; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] 5824; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1] 5825; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5826; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5827; AVX2-NEXT: vmovaps 288(%rdi), %ymm0 5828; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5829; AVX2-NEXT: vmovaps 256(%rdi), %ymm1 5830; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5831; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 5832; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 5833; AVX2-NEXT: vmovaps 352(%rdi), %ymm1 5834; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5835; AVX2-NEXT: vmovaps 320(%rdi), %ymm2 5836; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5837; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 5838; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2] 5839; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 5840; AVX2-NEXT: vmovaps 480(%rdi), %ymm2 5841; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5842; AVX2-NEXT: vmovaps 448(%rdi), %ymm3 5843; AVX2-NEXT: vmovups 
%ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5844; AVX2-NEXT: vmovaps 416(%rdi), %ymm8 5845; AVX2-NEXT: vmovaps 384(%rdi), %ymm1 5846; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5847; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5] 5848; AVX2-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 5849; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] 5850; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5851; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5852; AVX2-NEXT: vmovaps 800(%rdi), %ymm0 5853; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5854; AVX2-NEXT: vmovaps 768(%rdi), %ymm1 5855; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5856; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 5857; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 5858; AVX2-NEXT: vmovaps 864(%rdi), %ymm1 5859; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5860; AVX2-NEXT: vmovaps 832(%rdi), %ymm2 5861; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5862; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 5863; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill 5864; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 5865; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 5866; AVX2-NEXT: vmovaps 992(%rdi), %ymm2 5867; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5868; AVX2-NEXT: vmovaps 960(%rdi), %ymm3 5869; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5870; AVX2-NEXT: vmovaps 928(%rdi), %ymm7 5871; AVX2-NEXT: vmovaps 896(%rdi), %ymm1 5872; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5873; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] 5874; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5875; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 5876; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] 5877; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5878; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5879; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 5880; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5881; AVX2-NEXT: vmovaps (%rdi), %ymm1 5882; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5883; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 5884; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 5885; AVX2-NEXT: vmovaps 96(%rdi), %ymm1 5886; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5887; AVX2-NEXT: vmovaps 64(%rdi), %ymm2 5888; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5889; AVX2-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 5890; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] 5891; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 5892; AVX2-NEXT: vmovaps 224(%rdi), %ymm2 5893; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5894; AVX2-NEXT: vmovaps 192(%rdi), %ymm3 5895; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5896; AVX2-NEXT: vmovaps 160(%rdi), %ymm15 5897; AVX2-NEXT: 
vmovaps 128(%rdi), %ymm1 5898; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5899; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] 5900; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 5901; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] 5902; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5903; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5904; AVX2-NEXT: vmovaps 544(%rdi), %ymm14 5905; AVX2-NEXT: vmovaps 512(%rdi), %ymm11 5906; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5] 5907; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5908; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 5909; AVX2-NEXT: vmovaps 608(%rdi), %ymm1 5910; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5911; AVX2-NEXT: vmovaps 576(%rdi), %ymm2 5912; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5913; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 5914; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2] 5915; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3] 5916; AVX2-NEXT: vmovaps 736(%rdi), %ymm2 5917; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5918; AVX2-NEXT: vmovaps 704(%rdi), %ymm3 5919; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5920; AVX2-NEXT: vmovaps 672(%rdi), %ymm5 5921; AVX2-NEXT: vmovaps 640(%rdi), %ymm0 5922; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5923; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] 5924; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 5925; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 5926; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5927; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5928; AVX2-NEXT: vbroadcastss 404(%rdi), %ymm0 5929; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] 5930; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] 5931; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm1 5932; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5933; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5] 5934; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5935; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7] 5936; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10 5937; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] 5938; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5939; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5940; AVX2-NEXT: vbroadcastss 916(%rdi), %ymm0 5941; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] 5942; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] 5943; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload 5944; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 5945; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 5946; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5] 5947; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 5948; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] 5949; AVX2-NEXT: 
vextractf128 $1, %ymm7, %xmm7 5950; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] 5951; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5952; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 5953; AVX2-NEXT: vbroadcastss 148(%rdi), %ymm0 5954; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] 5955; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] 5956; AVX2-NEXT: vextractf128 $1, %ymm13, %xmm1 5957; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 5958; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5] 5959; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 5960; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7] 5961; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 5962; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] 5963; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5964; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5965; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm0 5966; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] 5967; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] 5968; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm1 5969; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] 5970; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] 5971; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 5972; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 5973; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5974; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5975; AVX2-NEXT: vbroadcastss 504(%rdi), %ymm0 5976; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 5977; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] 5978; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5979; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] 5980; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] 5981; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5982; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5983; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload 5984; AVX2-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 5985; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm2 5986; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2] 5987; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] 5988; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] 5989; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 5990; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5991; AVX2-NEXT: vbroadcastss 1016(%rdi), %ymm1 5992; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 5993; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7] 5994; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5995; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload 5996; AVX2-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 5997; AVX2-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] 5998; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5999; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte 
Folded Reload 6000; AVX2-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 6001; AVX2-NEXT: vextractf128 $1, %ymm12, %xmm2 6002; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2] 6003; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] 6004; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] 6005; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 6006; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6007; AVX2-NEXT: vbroadcastss 248(%rdi), %ymm2 6008; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 6009; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] 6010; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6011; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] 6012; AVX2-NEXT: vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] 6013; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6014; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload 6015; AVX2-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 6016; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm8 6017; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] 6018; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] 6019; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] 6020; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] 6021; AVX2-NEXT: vbroadcastss 760(%rdi), %ymm2 6022; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 6023; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] 6024; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6025; AVX2-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] 6026; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 6027; AVX2-NEXT: # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] 6028; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6029; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload 6030; AVX2-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 6031; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm15 6032; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] 6033; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] 6034; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] 6035; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] 6036; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm1 6037; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6038; AVX2-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 6039; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] 6040; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm4 6041; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] 6042; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 6043; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] 6044; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] 6045; AVX2-NEXT: vbroadcastss 476(%rdi), %ymm1 6046; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6047; AVX2-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 6048; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 6049; AVX2-NEXT: 
vextractf128 $1, %ymm6, %xmm1 6050; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 6051; AVX2-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] 6052; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 6053; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] 6054; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 6055; AVX2-NEXT: vbroadcastss 732(%rdi), %ymm1 6056; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6057; AVX2-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 6058; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] 6059; AVX2-NEXT: vextractf128 $1, %ymm13, %xmm5 6060; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7] 6061; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 6062; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 6063; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] 6064; AVX2-NEXT: vbroadcastss 988(%rdi), %ymm1 6065; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6066; AVX2-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 6067; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] 6068; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm3 6069; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7] 6070; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 6071; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] 6072; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] 6073; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6074; AVX2-NEXT: vmovaps %ymm3, 64(%rsi) 6075; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6076; AVX2-NEXT: vmovaps %ymm3, (%rsi) 6077; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6078; AVX2-NEXT: vmovaps %ymm3, 96(%rsi) 6079; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6080; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) 6081; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6082; AVX2-NEXT: vmovaps %ymm3, 64(%rdx) 6083; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6084; AVX2-NEXT: vmovaps %ymm3, (%rdx) 6085; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6086; AVX2-NEXT: vmovaps %ymm3, 96(%rdx) 6087; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6088; AVX2-NEXT: vmovaps %ymm3, 32(%rdx) 6089; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6090; AVX2-NEXT: vmovaps %ymm3, 64(%rcx) 6091; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6092; AVX2-NEXT: vmovaps %ymm3, (%rcx) 6093; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6094; AVX2-NEXT: vmovaps %ymm3, 96(%rcx) 6095; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6096; AVX2-NEXT: vmovaps %ymm3, 32(%rcx) 6097; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6098; AVX2-NEXT: vmovaps %ymm3, 64(%r8) 6099; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6100; AVX2-NEXT: vmovaps %ymm3, (%r8) 6101; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6102; AVX2-NEXT: vmovaps %ymm3, 96(%r8) 6103; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6104; AVX2-NEXT: vmovaps %ymm3, 32(%r8) 6105; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6106; AVX2-NEXT: vmovaps %ymm3, 64(%r9) 6107; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6108; 
AVX2-NEXT: vmovaps %ymm3, (%r9) 6109; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6110; AVX2-NEXT: vmovaps %ymm3, 96(%r9) 6111; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6112; AVX2-NEXT: vmovaps %ymm3, 32(%r9) 6113; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 6114; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6115; AVX2-NEXT: vmovaps %ymm3, 64(%rax) 6116; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6117; AVX2-NEXT: vmovaps %ymm3, (%rax) 6118; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload 6119; AVX2-NEXT: vmovaps %ymm3, 96(%rax) 6120; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6121; AVX2-NEXT: vmovaps %ymm3, 32(%rax) 6122; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 6123; AVX2-NEXT: vmovaps %ymm2, 64(%rax) 6124; AVX2-NEXT: vmovaps %ymm8, (%rax) 6125; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6126; AVX2-NEXT: vmovaps %ymm2, 96(%rax) 6127; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6128; AVX2-NEXT: vmovaps %ymm2, 32(%rax) 6129; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 6130; AVX2-NEXT: vmovaps %ymm1, 96(%rax) 6131; AVX2-NEXT: vmovaps %ymm5, 64(%rax) 6132; AVX2-NEXT: vmovaps %ymm0, 32(%rax) 6133; AVX2-NEXT: vmovaps %ymm4, (%rax) 6134; AVX2-NEXT: addq $1544, %rsp # imm = 0x608 6135; AVX2-NEXT: vzeroupper 6136; AVX2-NEXT: retq 6137; 6138; AVX2-FP-LABEL: load_i32_stride8_vf32: 6139; AVX2-FP: # %bb.0: 6140; AVX2-FP-NEXT: subq $1544, %rsp # imm = 0x608 6141; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm0 6142; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6143; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm10 6144; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] 6145; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm14 6146; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm1 6147; AVX2-FP-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6148; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm15 6149; AVX2-FP-NEXT: vbroadcastss %xmm15, %xmm2 6150; AVX2-FP-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6151; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 6152; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6153; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm1 6154; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6155; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm12 6156; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] 6157; AVX2-FP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6158; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6159; AVX2-FP-NEXT: vmovaps 480(%rdi), %xmm2 6160; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6161; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2 6162; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm3 6163; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6164; AVX2-FP-NEXT: vbroadcastss %xmm3, %xmm3 6165; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 6166; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6167; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] 6168; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6169; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6170; AVX2-FP-NEXT: vmovaps 800(%rdi), %xmm1 6171; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6172; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm0 6173; AVX2-FP-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6174; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 6175; AVX2-FP-NEXT: vmovaps 864(%rdi), %xmm1 6176; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6177; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 6178; AVX2-FP-NEXT: vmovaps 832(%rdi), %xmm2 6179; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6180; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2 6181; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 6182; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6183; AVX2-FP-NEXT: vmovaps 992(%rdi), %xmm1 6184; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6185; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 6186; AVX2-FP-NEXT: vmovaps 960(%rdi), %xmm11 6187; AVX2-FP-NEXT: vbroadcastss %xmm11, %xmm2 6188; AVX2-FP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6189; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 6190; AVX2-FP-NEXT: vmovaps 928(%rdi), %xmm2 6191; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6192; AVX2-FP-NEXT: vmovaps 896(%rdi), %xmm3 6193; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6194; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 6195; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6196; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6197; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 6198; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6199; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6200; AVX2-FP-NEXT: vmovaps 608(%rdi), %xmm0 6201; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6202; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0 6203; AVX2-FP-NEXT: vmovaps 576(%rdi), %xmm1 6204; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6205; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 6206; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 6207; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm1 6208; AVX2-FP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill 6209; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm2 6210; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6211; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 6212; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 6213; AVX2-FP-NEXT: vmovaps 736(%rdi), %xmm1 6214; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6215; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 6216; AVX2-FP-NEXT: vmovaps 704(%rdi), %xmm2 6217; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6218; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2 6219; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 6220; AVX2-FP-NEXT: vmovaps 672(%rdi), %xmm2 6221; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6222; AVX2-FP-NEXT: vmovaps 640(%rdi), %xmm3 6223; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6224; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 6225; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6226; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6227; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 6228; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6229; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6230; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm0 
6231; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6232; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0 6233; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm1 6234; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6235; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 6236; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 6237; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm9 6238; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm8 6239; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] 6240; AVX2-FP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6241; AVX2-FP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6242; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6243; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 6244; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7] 6245; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm7 6246; AVX2-FP-NEXT: vbroadcastss %xmm7, %xmm1 6247; AVX2-FP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6248; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm6 6249; AVX2-FP-NEXT: vbroadcastss %xmm6, %xmm2 6250; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6251; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 6252; AVX2-FP-NEXT: vmovaps (%rdi), %xmm5 6253; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4 6254; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 6255; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6256; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6257; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] 6258; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7] 6259; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6260; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] 6261; AVX2-FP-NEXT: vmovaps %xmm10, %xmm3 6262; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6263; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3] 6264; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 6265; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6266; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6267; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 6268; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 6269; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6270; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1] 6271; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 6272; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 6273; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6274; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 6275; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6276; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6277; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6278; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1] 6279; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 6280; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 6281; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 6282; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload 6283; AVX2-FP-NEXT: # xmm1 = xmm12[0],mem[0],xmm12[1],mem[1] 6284; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2,3] 6285; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload 6286; AVX2-FP-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] 6287; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6288; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6289; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] 6290; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 6291; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] 6292; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6293; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 6294; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6295; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6296; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] 6297; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] 6298; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 6299; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6300; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6301; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 6302; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 6303; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6304; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] 6305; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] 6306; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6307; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 6308; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6309; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6310; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 6311; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] 6312; AVX2-FP-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload 6313; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] 6314; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 6315; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6316; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] 6317; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6318; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6319; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6320; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 6321; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6322; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6323; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1] 6324; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6325; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] 6326; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6327; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 6328; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6329; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6330; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] 6331; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6332; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6333; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 6334; AVX2-FP-NEXT: # xmm1 = 
xmm0[2],mem[2],xmm0[3],mem[3] 6335; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6336; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] 6337; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] 6338; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 6339; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6340; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 6341; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6342; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2] 6343; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6344; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] 6345; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] 6346; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 6347; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6348; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6349; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload 6350; AVX2-FP-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] 6351; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6352; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] 6353; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6354; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 6355; AVX2-FP-NEXT: # xmm15 = mem[2,2,2,2] 6356; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6357; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] 6358; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 6359; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 6360; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 6361; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6362; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm0[2,2,2,2] 6363; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3] 6364; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm3[0,1],xmm15[2,3] 6365; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] 6366; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6367; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] 6368; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] 6369; AVX2-FP-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2] 6370; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 6371; AVX2-FP-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] 6372; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 6373; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15 6374; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] 6375; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2] 6376; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload 6377; AVX2-FP-NEXT: # xmm15 = mem[0,1,2],xmm15[3] 6378; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3] 6379; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] 6380; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6381; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6382; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload 6383; AVX2-FP-NEXT: # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3] 6384; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte 
Reload 6385; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm15[2,2,2,2] 6386; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6387; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] 6388; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 6389; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11 6390; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] 6391; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6392; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload 6393; AVX2-FP-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3] 6394; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6395; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2] 6396; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 6397; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] 6398; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 6399; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] 6400; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6401; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] 6402; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6403; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] 6404; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6405; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload 6406; AVX2-FP-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] 6407; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 6408; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 6409; AVX2-FP-NEXT: # xmm3 = mem[2,3,2,3] 6410; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 6411; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] 6412; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 6413; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6414; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6415; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] 6416; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6417; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] 6418; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload 6419; AVX2-FP-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] 6420; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6421; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 6422; AVX2-FP-NEXT: # xmm1 = mem[2,3,2,3] 6423; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6424; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] 6425; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6426; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6427; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6428; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 6429; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 6430; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] 6431; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload 6432; AVX2-FP-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] 6433; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6434; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] 6435; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, 
%ymm0, %ymm2 6436; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 6437; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6438; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6439; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm15[2],xmm4[3],xmm15[3] 6440; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 6441; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] 6442; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6443; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 6444; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] 6445; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1] 6446; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 6447; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6448; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm0 6449; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6450; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm1 6451; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6452; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 6453; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 6454; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm1 6455; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6456; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm2 6457; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6458; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 6459; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2] 6460; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6461; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm2 6462; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6463; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm3 6464; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6465; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm8 6466; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm1 6467; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6468; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5] 6469; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 6470; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] 6471; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6472; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6473; AVX2-FP-NEXT: vmovaps 800(%rdi), %ymm0 6474; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6475; AVX2-FP-NEXT: vmovaps 768(%rdi), %ymm1 6476; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6477; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 6478; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 6479; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm1 6480; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6481; AVX2-FP-NEXT: vmovaps 832(%rdi), %ymm2 6482; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6483; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 6484; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill 6485; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 6486; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6487; AVX2-FP-NEXT: vmovaps 
992(%rdi), %ymm2 6488; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6489; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm3 6490; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6491; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm7 6492; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm1 6493; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6494; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] 6495; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6496; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 6497; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] 6498; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6499; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6500; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0 6501; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6502; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1 6503; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6504; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 6505; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 6506; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1 6507; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6508; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2 6509; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6510; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 6511; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] 6512; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6513; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2 6514; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6515; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3 6516; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6517; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm15 6518; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1 6519; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6520; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] 6521; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 6522; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] 6523; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6524; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6525; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm14 6526; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm11 6527; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5] 6528; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6529; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 6530; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm1 6531; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6532; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm2 6533; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6534; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 6535; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2] 6536; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3] 6537; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm2 6538; AVX2-FP-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6539; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm3 6540; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6541; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm5 6542; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm0 6543; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6544; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] 6545; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 6546; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 6547; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 6548; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6549; AVX2-FP-NEXT: vbroadcastss 404(%rdi), %ymm0 6550; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] 6551; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] 6552; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm1 6553; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6554; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5] 6555; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6556; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7] 6557; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10 6558; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] 6559; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 6560; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6561; AVX2-FP-NEXT: vbroadcastss 916(%rdi), %ymm0 6562; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] 6563; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] 6564; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload 6565; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1 6566; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 6567; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5] 6568; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 6569; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] 6570; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7 6571; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] 6572; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 6573; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 6574; AVX2-FP-NEXT: vbroadcastss 148(%rdi), %ymm0 6575; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] 6576; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] 6577; AVX2-FP-NEXT: vextractf128 $1, %ymm13, %xmm1 6578; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 6579; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5] 6580; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 6581; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7] 6582; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 6583; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] 6584; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 6585; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6586; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm0 6587; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] 6588; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] 6589; AVX2-FP-NEXT: vextractf128 $1, 
%ymm9, %xmm1 6590; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] 6591; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] 6592; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 6593; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 6594; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 6595; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6596; AVX2-FP-NEXT: vbroadcastss 504(%rdi), %ymm0 6597; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 6598; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] 6599; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6600; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] 6601; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] 6602; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6603; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6604; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload 6605; AVX2-FP-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 6606; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm2 6607; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2] 6608; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] 6609; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] 6610; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 6611; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6612; AVX2-FP-NEXT: vbroadcastss 1016(%rdi), %ymm1 6613; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6614; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7] 6615; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6616; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload 6617; AVX2-FP-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 6618; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] 6619; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6620; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload 6621; AVX2-FP-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 6622; AVX2-FP-NEXT: vextractf128 $1, %ymm12, %xmm2 6623; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2] 6624; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] 6625; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] 6626; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 6627; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6628; AVX2-FP-NEXT: vbroadcastss 248(%rdi), %ymm2 6629; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 6630; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] 6631; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6632; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] 6633; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] 6634; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6635; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1, %ymm14 # 32-byte Folded Reload 6636; AVX2-FP-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 6637; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm8 6638; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] 6639; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] 6640; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] 6641; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] 6642; AVX2-FP-NEXT: vbroadcastss 760(%rdi), %ymm2 6643; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 6644; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] 6645; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6646; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] 6647; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 6648; AVX2-FP-NEXT: # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] 6649; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6650; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload 6651; AVX2-FP-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 6652; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm15 6653; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] 6654; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] 6655; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] 6656; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] 6657; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm1 6658; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6659; AVX2-FP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 6660; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] 6661; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm4 6662; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] 6663; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 6664; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] 6665; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] 6666; AVX2-FP-NEXT: vbroadcastss 476(%rdi), %ymm1 6667; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6668; AVX2-FP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 6669; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 6670; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm1 6671; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 6672; AVX2-FP-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] 6673; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 6674; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] 6675; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 6676; AVX2-FP-NEXT: vbroadcastss 732(%rdi), %ymm1 6677; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6678; AVX2-FP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 6679; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] 6680; AVX2-FP-NEXT: vextractf128 $1, %ymm13, %xmm5 6681; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7] 6682; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 6683; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 6684; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] 6685; AVX2-FP-NEXT: 
vbroadcastss 988(%rdi), %ymm1 6686; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6687; AVX2-FP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 6688; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] 6689; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm3 6690; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7] 6691; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 6692; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] 6693; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] 6694; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6695; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rsi) 6696; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6697; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) 6698; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6699; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rsi) 6700; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6701; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi) 6702; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6703; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rdx) 6704; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6705; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx) 6706; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6707; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rdx) 6708; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6709; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx) 6710; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6711; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rcx) 6712; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6713; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx) 6714; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6715; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rcx) 6716; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6717; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rcx) 6718; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6719; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r8) 6720; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6721; AVX2-FP-NEXT: vmovaps %ymm3, (%r8) 6722; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6723; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r8) 6724; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6725; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r8) 6726; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6727; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r9) 6728; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6729; AVX2-FP-NEXT: vmovaps %ymm3, (%r9) 6730; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6731; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r9) 6732; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6733; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r9) 6734; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6735; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6736; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rax) 6737; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6738; AVX2-FP-NEXT: vmovaps %ymm3, (%rax) 6739; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload 6740; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rax) 6741; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6742; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rax) 6743; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), 
%rax 6744; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rax) 6745; AVX2-FP-NEXT: vmovaps %ymm8, (%rax) 6746; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6747; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rax) 6748; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6749; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rax) 6750; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6751; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax) 6752; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rax) 6753; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) 6754; AVX2-FP-NEXT: vmovaps %ymm4, (%rax) 6755; AVX2-FP-NEXT: addq $1544, %rsp # imm = 0x608 6756; AVX2-FP-NEXT: vzeroupper 6757; AVX2-FP-NEXT: retq 6758; 6759; AVX2-FCP-LABEL: load_i32_stride8_vf32: 6760; AVX2-FCP: # %bb.0: 6761; AVX2-FCP-NEXT: subq $1544, %rsp # imm = 0x608 6762; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm0 6763; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6764; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm10 6765; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] 6766; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm14 6767; AVX2-FCP-NEXT: vbroadcastss %xmm14, %xmm1 6768; AVX2-FCP-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6769; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm15 6770; AVX2-FCP-NEXT: vbroadcastss %xmm15, %xmm2 6771; AVX2-FCP-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6772; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 6773; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6774; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm1 6775; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6776; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm12 6777; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] 6778; AVX2-FCP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6779; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6780; AVX2-FCP-NEXT: vmovaps 480(%rdi), %xmm2 6781; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6782; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2 6783; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm3 6784; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6785; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm3 6786; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 6787; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6788; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] 6789; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6790; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6791; AVX2-FCP-NEXT: vmovaps 800(%rdi), %xmm1 6792; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6793; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm0 6794; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6795; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 6796; AVX2-FCP-NEXT: vmovaps 864(%rdi), %xmm1 6797; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6798; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 6799; AVX2-FCP-NEXT: vmovaps 832(%rdi), %xmm2 6800; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6801; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2 6802; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 6803; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6804; AVX2-FCP-NEXT: vmovaps 992(%rdi), %xmm1 6805; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
6806; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 6807; AVX2-FCP-NEXT: vmovaps 960(%rdi), %xmm11 6808; AVX2-FCP-NEXT: vbroadcastss %xmm11, %xmm2 6809; AVX2-FCP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6810; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 6811; AVX2-FCP-NEXT: vmovaps 928(%rdi), %xmm2 6812; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6813; AVX2-FCP-NEXT: vmovaps 896(%rdi), %xmm3 6814; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6815; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 6816; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6817; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6818; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 6819; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6820; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6821; AVX2-FCP-NEXT: vmovaps 608(%rdi), %xmm0 6822; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6823; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0 6824; AVX2-FCP-NEXT: vmovaps 576(%rdi), %xmm1 6825; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6826; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 6827; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 6828; AVX2-FCP-NEXT: vmovaps 544(%rdi), %xmm1 6829; AVX2-FCP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill 6830; AVX2-FCP-NEXT: vmovaps 512(%rdi), %xmm2 6831; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6832; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 6833; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 6834; AVX2-FCP-NEXT: vmovaps 736(%rdi), %xmm1 6835; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6836; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 6837; AVX2-FCP-NEXT: vmovaps 704(%rdi), %xmm2 6838; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6839; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2 6840; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 6841; AVX2-FCP-NEXT: vmovaps 672(%rdi), %xmm2 6842; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6843; AVX2-FCP-NEXT: vmovaps 640(%rdi), %xmm3 6844; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6845; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 6846; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6847; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6848; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 6849; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6850; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6851; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm0 6852; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6853; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0 6854; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm1 6855; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6856; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 6857; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 6858; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm9 6859; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm8 6860; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] 6861; AVX2-FCP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6862; AVX2-FCP-NEXT: vmovaps %xmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6863; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6864; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 6865; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7] 6866; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm7 6867; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1 6868; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6869; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm6 6870; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm2 6871; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6872; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 6873; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm5 6874; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4 6875; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 6876; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6877; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6878; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] 6879; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7] 6880; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6881; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] 6882; AVX2-FCP-NEXT: vmovaps %xmm10, %xmm3 6883; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6884; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3] 6885; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 6886; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6887; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6888; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 6889; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 6890; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6891; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1] 6892; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 6893; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 6894; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6895; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 6896; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6897; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6898; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6899; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1] 6900; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 6901; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 6902; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 6903; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload 6904; AVX2-FCP-NEXT: # xmm1 = xmm12[0],mem[0],xmm12[1],mem[1] 6905; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6906; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload 6907; AVX2-FCP-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] 6908; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6909; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6910; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] 6911; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 6912; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] 6913; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6914; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 6915; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6916; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6917; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] 6918; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] 6919; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 6920; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6921; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6922; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 6923; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 6924; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6925; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] 6926; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] 6927; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6928; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 6929; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6930; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6931; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 6932; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] 6933; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload 6934; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] 6935; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 6936; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6937; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] 6938; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 6939; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6940; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6941; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 6942; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6943; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6944; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1] 6945; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6946; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] 6947; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 6948; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 6949; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6950; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6951; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] 6952; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6953; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6954; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 6955; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 6956; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6957; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] 6958; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] 6959; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 6960; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6961; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 6962; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6963; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2] 6964; AVX2-FCP-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6965; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] 6966; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] 6967; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 6968; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6969; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6970; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload 6971; AVX2-FCP-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] 6972; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6973; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] 6974; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6975; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 6976; AVX2-FCP-NEXT: # xmm15 = mem[2,2,2,2] 6977; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6978; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] 6979; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 6980; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 6981; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 6982; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6983; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm0[2,2,2,2] 6984; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3] 6985; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm3[0,1],xmm15[2,3] 6986; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] 6987; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6988; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] 6989; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] 6990; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2] 6991; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 6992; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] 6993; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 6994; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15 6995; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] 6996; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2] 6997; AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload 6998; AVX2-FCP-NEXT: # xmm15 = mem[0,1,2],xmm15[3] 6999; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3] 7000; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] 7001; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7002; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7003; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload 7004; AVX2-FCP-NEXT: # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3] 7005; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7006; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm15[2,2,2,2] 7007; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 7008; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] 7009; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 7010; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11 7011; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] 7012; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7013; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3, %xmm6 # 16-byte Folded Reload 7014; AVX2-FCP-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3] 7015; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 7016; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2] 7017; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 7018; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] 7019; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 7020; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] 7021; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7022; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] 7023; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7024; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] 7025; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7026; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload 7027; AVX2-FCP-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] 7028; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 7029; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7030; AVX2-FCP-NEXT: # xmm3 = mem[2,3,2,3] 7031; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 7032; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] 7033; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 7034; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7035; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7036; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] 7037; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7038; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] 7039; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload 7040; AVX2-FCP-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] 7041; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 7042; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7043; AVX2-FCP-NEXT: # xmm1 = mem[2,3,2,3] 7044; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 7045; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] 7046; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7047; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7048; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7049; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 7050; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 7051; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] 7052; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload 7053; AVX2-FCP-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] 7054; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 7055; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] 7056; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 7057; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 7058; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7059; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7060; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm15[2],xmm4[3],xmm15[3] 7061; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 7062; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] 7063; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, 
%ymm1 7064; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 7065; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] 7066; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1] 7067; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 7068; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7069; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm0 7070; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7071; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm1 7072; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7073; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 7074; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 7075; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm1 7076; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7077; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm2 7078; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7079; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 7080; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2] 7081; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 7082; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm2 7083; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7084; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm3 7085; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7086; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm8 7087; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm1 7088; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7089; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5] 7090; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 7091; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] 7092; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7093; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7094; AVX2-FCP-NEXT: vmovaps 800(%rdi), %ymm0 7095; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7096; AVX2-FCP-NEXT: vmovaps 768(%rdi), %ymm1 7097; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7098; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 7099; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 7100; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm1 7101; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7102; AVX2-FCP-NEXT: vmovaps 832(%rdi), %ymm2 7103; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7104; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 7105; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill 7106; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 7107; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 7108; AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm2 7109; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7110; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm3 7111; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7112; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm7 7113; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm1 7114; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7115; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = 
ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] 7116; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7117; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 7118; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] 7119; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7120; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7121; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0 7122; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7123; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1 7124; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7125; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 7126; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 7127; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1 7128; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7129; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2 7130; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7131; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 7132; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] 7133; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 7134; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2 7135; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7136; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm3 7137; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7138; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm15 7139; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1 7140; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7141; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] 7142; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 7143; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] 7144; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7145; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7146; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm14 7147; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm11 7148; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5] 7149; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7150; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 7151; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm1 7152; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7153; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm2 7154; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7155; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 7156; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2] 7157; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3] 7158; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm2 7159; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7160; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm3 7161; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7162; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm5 7163; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm0 7164; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7165; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = 
ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] 7166; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 7167; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 7168; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 7169; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7170; AVX2-FCP-NEXT: vbroadcastss 404(%rdi), %ymm0 7171; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] 7172; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] 7173; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm1 7174; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 7175; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5] 7176; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7177; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7] 7178; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm10 7179; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] 7180; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 7181; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7182; AVX2-FCP-NEXT: vbroadcastss 916(%rdi), %ymm0 7183; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] 7184; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] 7185; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload 7186; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 7187; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 7188; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5] 7189; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 7190; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] 7191; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm7 7192; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] 7193; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 7194; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 7195; AVX2-FCP-NEXT: vbroadcastss 148(%rdi), %ymm0 7196; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] 7197; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] 7198; AVX2-FCP-NEXT: vextractf128 $1, %ymm13, %xmm1 7199; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 7200; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5] 7201; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 7202; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7] 7203; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4 7204; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] 7205; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 7206; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7207; AVX2-FCP-NEXT: vbroadcastss 660(%rdi), %ymm0 7208; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] 7209; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] 7210; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm1 7211; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] 7212; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] 7213; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 7214; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 7215; AVX2-FCP-NEXT: vblendps 
{{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 7216; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7217; AVX2-FCP-NEXT: vbroadcastss 504(%rdi), %ymm0 7218; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 7219; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] 7220; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7221; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] 7222; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] 7223; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7224; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7225; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload 7226; AVX2-FCP-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 7227; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm2 7228; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2] 7229; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] 7230; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] 7231; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 7232; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7233; AVX2-FCP-NEXT: vbroadcastss 1016(%rdi), %ymm1 7234; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 7235; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7] 7236; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7237; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload 7238; AVX2-FCP-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 7239; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] 7240; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7241; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload 7242; AVX2-FCP-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 7243; AVX2-FCP-NEXT: vextractf128 $1, %ymm12, %xmm2 7244; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2] 7245; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] 7246; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] 7247; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 7248; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7249; AVX2-FCP-NEXT: vbroadcastss 248(%rdi), %ymm2 7250; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 7251; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] 7252; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7253; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] 7254; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] 7255; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7256; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload 7257; AVX2-FCP-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 7258; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm8 7259; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] 7260; AVX2-FCP-NEXT: vblendps {{.*#+}} 
xmm8 = xmm8[0,1],xmm15[2,3] 7261; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] 7262; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] 7263; AVX2-FCP-NEXT: vbroadcastss 760(%rdi), %ymm2 7264; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 7265; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] 7266; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7267; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] 7268; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 7269; AVX2-FCP-NEXT: # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] 7270; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7271; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload 7272; AVX2-FCP-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 7273; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm15 7274; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] 7275; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] 7276; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] 7277; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] 7278; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm1 7279; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 7280; AVX2-FCP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 7281; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] 7282; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm4 7283; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] 7284; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm9 7285; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] 7286; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] 7287; AVX2-FCP-NEXT: vbroadcastss 476(%rdi), %ymm1 7288; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 7289; AVX2-FCP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 7290; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 7291; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm1 7292; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 7293; AVX2-FCP-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] 7294; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 7295; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] 7296; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 7297; AVX2-FCP-NEXT: vbroadcastss 732(%rdi), %ymm1 7298; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 7299; AVX2-FCP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 7300; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] 7301; AVX2-FCP-NEXT: vextractf128 $1, %ymm13, %xmm5 7302; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7] 7303; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 7304; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 7305; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] 7306; AVX2-FCP-NEXT: vbroadcastss 988(%rdi), %ymm1 7307; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 7308; AVX2-FCP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 7309; 
AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] 7310; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm3 7311; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7] 7312; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 7313; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] 7314; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] 7315; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7316; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rsi) 7317; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7318; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) 7319; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7320; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rsi) 7321; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7322; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) 7323; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7324; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rdx) 7325; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7326; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx) 7327; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7328; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rdx) 7329; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7330; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx) 7331; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7332; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rcx) 7333; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7334; AVX2-FCP-NEXT: vmovaps %ymm3, (%rcx) 7335; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7336; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rcx) 7337; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7338; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rcx) 7339; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7340; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r8) 7341; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7342; AVX2-FCP-NEXT: vmovaps %ymm3, (%r8) 7343; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7344; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r8) 7345; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7346; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r8) 7347; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7348; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r9) 7349; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7350; AVX2-FCP-NEXT: vmovaps %ymm3, (%r9) 7351; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7352; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r9) 7353; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7354; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r9) 7355; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7356; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7357; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rax) 7358; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7359; AVX2-FCP-NEXT: vmovaps %ymm3, (%rax) 7360; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload 7361; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rax) 7362; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7363; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rax) 7364; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7365; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rax) 7366; AVX2-FCP-NEXT: vmovaps %ymm8, (%rax) 7367; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload 7368; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax) 7369; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7370; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rax) 7371; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7372; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax) 7373; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rax) 7374; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) 7375; AVX2-FCP-NEXT: vmovaps %ymm4, (%rax) 7376; AVX2-FCP-NEXT: addq $1544, %rsp # imm = 0x608 7377; AVX2-FCP-NEXT: vzeroupper 7378; AVX2-FCP-NEXT: retq 7379; 7380; AVX512-LABEL: load_i32_stride8_vf32: 7381; AVX512: # %bb.0: 7382; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 7383; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 7384; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 7385; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 7386; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm29 7387; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 7388; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30 7389; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm31 7390; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm3 7391; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7 7392; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6 7393; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm9 7394; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm5 7395; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12 7396; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm2 7397; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm14 7398; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm11 7399; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm16 7400; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm15 7401; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 7402; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7403; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 7404; AVX512-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 7405; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 7406; AVX512-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 7407; AVX512-NEXT: movb $-64, %dil 7408; AVX512-NEXT: kmovw %edi, %k1 7409; AVX512-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} 7410; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 7411; AVX512-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 7412; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 7413; AVX512-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 7414; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] 7415; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 7416; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 7417; AVX512-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 7418; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 7419; AVX512-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 7420; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7421; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 7422; AVX512-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 7423; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 7424; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7425; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 7426; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 7427; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7428; AVX512-NEXT: vmovdqa64 %zmm15, %zmm10 7429; AVX512-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 7430; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 7431; AVX512-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 7432; AVX512-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 7433; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 7434; AVX512-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 7435; AVX512-NEXT: vmovdqa64 %zmm5, %zmm4 7436; AVX512-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 7437; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 7438; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 7439; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 7440; AVX512-NEXT: vpermt2d %zmm7, %zmm8, 
%zmm4 7441; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 7442; AVX512-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 7443; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} 7444; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 7445; AVX512-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 7446; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 7447; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 7448; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 7449; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 7450; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7451; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 7452; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7453; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 7454; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7455; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7456; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 7457; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7458; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 7459; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7460; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7461; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 7462; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 7463; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 7464; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 7465; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 7466; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7467; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 7468; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 7469; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7470; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 7471; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 7472; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 7473; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7474; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 7475; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7476; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 7477; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7478; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7479; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 7480; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7481; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 7482; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7483; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7484; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 7485; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 7486; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 7487; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 7488; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 7489; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7490; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 7491; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 7492; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7493; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 7494; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 7495; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 7496; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7497; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 7498; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7499; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 7500; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7501; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7502; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 7503; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7504; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 7505; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7506; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7507; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 
7508; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 7509; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 7510; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 7511; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 7512; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7513; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 7514; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 7515; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7516; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 7517; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 7518; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 7519; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7520; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 7521; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7522; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 7523; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7524; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7525; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 7526; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7527; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 7528; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7529; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7530; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 7531; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 7532; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 7533; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 7534; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 7535; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7536; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 7537; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 7538; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7539; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 7540; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 7541; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 7542; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7543; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 7544; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7545; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 7546; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7547; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7548; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 7549; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7550; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 7551; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7552; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7553; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 7554; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 7555; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 7556; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 7557; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 7558; AVX512-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 7559; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 7560; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 7561; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7562; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 7563; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 7564; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 7565; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7566; AVX512-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 7567; AVX512-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 7568; AVX512-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} 7569; AVX512-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 7570; AVX512-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 7571; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 7572; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 7573; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 7574; AVX512-NEXT: vpermt2d 
%zmm31, %zmm10, %zmm3 7575; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 7576; AVX512-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 7577; AVX512-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 7578; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7579; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 7580; AVX512-NEXT: vmovdqa64 %zmm28, 64(%rsi) 7581; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi) 7582; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rdx) 7583; AVX512-NEXT: vmovdqa64 %zmm19, (%rdx) 7584; AVX512-NEXT: vmovdqa64 %zmm20, 64(%rcx) 7585; AVX512-NEXT: vmovdqa64 %zmm21, (%rcx) 7586; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r8) 7587; AVX512-NEXT: vmovdqa64 %zmm23, (%r8) 7588; AVX512-NEXT: vmovdqa64 %zmm24, 64(%r9) 7589; AVX512-NEXT: vmovdqa64 %zmm25, (%r9) 7590; AVX512-NEXT: vmovdqa64 %zmm26, 64(%r11) 7591; AVX512-NEXT: vmovdqa64 %zmm27, (%r11) 7592; AVX512-NEXT: vmovdqa64 %zmm8, 64(%r10) 7593; AVX512-NEXT: vmovdqa64 %zmm4, (%r10) 7594; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax) 7595; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) 7596; AVX512-NEXT: vzeroupper 7597; AVX512-NEXT: retq 7598; 7599; AVX512-FCP-LABEL: load_i32_stride8_vf32: 7600; AVX512-FCP: # %bb.0: 7601; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7602; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 7603; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 7604; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 7605; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 7606; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 7607; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 7608; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 7609; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 7610; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 7611; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 7612; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 7613; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 7614; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 7615; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 7616; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 7617; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 7618; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16 7619; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15 7620; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 7621; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7622; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 7623; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 7624; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 7625; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 7626; AVX512-FCP-NEXT: movb $-64, %dil 7627; AVX512-FCP-NEXT: kmovw %edi, %k1 7628; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} 7629; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 7630; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 7631; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 7632; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 7633; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] 7634; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 7635; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 7636; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 7637; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 7638; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 7639; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7640; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 7641; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 7642; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 7643; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7644; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 7645; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = 
[1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 7646; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7647; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 7648; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 7649; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 7650; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 7651; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 7652; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 7653; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 7654; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 7655; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 7656; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 7657; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 7658; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 7659; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 7660; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 7661; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 7662; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} 7663; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 7664; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 7665; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 7666; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 7667; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 7668; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 7669; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7670; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 7671; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7672; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 7673; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7674; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7675; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 7676; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7677; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 7678; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7679; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7680; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 7681; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 7682; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 7683; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 7684; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 7685; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7686; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 7687; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 7688; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7689; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 7690; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 7691; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 7692; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7693; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 7694; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7695; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 7696; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7697; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7698; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 7699; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7700; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 7701; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7702; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7703; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 7704; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 7705; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 7706; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 7707; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 7708; AVX512-FCP-NEXT: vmovdqa64 
%zmm8, %zmm10 {%k1} 7709; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 7710; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 7711; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7712; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 7713; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 7714; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 7715; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7716; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 7717; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7718; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 7719; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7720; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7721; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 7722; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7723; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 7724; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7725; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7726; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 7727; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 7728; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 7729; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 7730; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 7731; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7732; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 7733; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 7734; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7735; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 7736; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 7737; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 7738; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7739; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 7740; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7741; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 7742; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7743; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7744; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 7745; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7746; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 7747; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7748; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7749; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 7750; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 7751; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 7752; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 7753; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 7754; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7755; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 7756; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 7757; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7758; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 7759; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 7760; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 7761; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7762; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 7763; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7764; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 7765; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7766; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7767; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 7768; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7769; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 7770; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7771; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7772; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 7773; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 7774; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 7775; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 7776; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 7777; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 7778; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 7779; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 7780; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7781; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 7782; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 7783; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 7784; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7785; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 7786; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 7787; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} 7788; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 7789; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 7790; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 7791; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 7792; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 7793; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 7794; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 7795; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 7796; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 7797; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7798; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 7799; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi) 7800; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) 7801; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) 7802; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) 7803; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx) 7804; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) 7805; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) 7806; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8) 7807; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9) 7808; AVX512-FCP-NEXT: vmovdqa64 %zmm25, (%r9) 7809; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11) 7810; AVX512-FCP-NEXT: vmovdqa64 %zmm27, (%r11) 7811; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10) 7812; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r10) 7813; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) 7814; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 7815; AVX512-FCP-NEXT: vzeroupper 7816; AVX512-FCP-NEXT: retq 7817; 7818; AVX512DQ-LABEL: load_i32_stride8_vf32: 7819; AVX512DQ: # %bb.0: 7820; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 7821; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 7822; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 7823; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 7824; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm29 7825; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1 7826; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm30 7827; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm31 7828; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm3 7829; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm7 7830; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm6 7831; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm9 7832; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm5 7833; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm12 7834; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm2 7835; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm14 7836; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm11 7837; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm16 7838; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm15 7839; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = 
[0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 7840; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7841; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17 7842; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 7843; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm18 7844; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 7845; AVX512DQ-NEXT: movb $-64, %dil 7846; AVX512DQ-NEXT: kmovw %edi, %k1 7847; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} 7848; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 7849; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 7850; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 7851; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 7852; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] 7853; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 7854; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 7855; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 7856; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 7857; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 7858; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7859; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 7860; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 7861; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 7862; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7863; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 7864; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 7865; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7866; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm10 7867; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 7868; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 7869; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 7870; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 7871; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 7872; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 7873; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm4 7874; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 7875; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 7876; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 7877; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 7878; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 7879; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 7880; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 7881; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} 7882; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 7883; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 7884; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 7885; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 7886; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 7887; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 7888; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7889; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 7890; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7891; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 7892; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7893; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7894; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 7895; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7896; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 7897; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7898; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7899; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 7900; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 7901; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 7902; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 7903; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 7904; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7905; AVX512DQ-NEXT: 
vmovdqa64 %zmm1, %zmm8 7906; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 7907; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7908; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 7909; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 7910; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 7911; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7912; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 7913; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7914; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 7915; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7916; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7917; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 7918; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7919; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 7920; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7921; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7922; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 7923; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 7924; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 7925; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 7926; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 7927; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7928; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 7929; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 7930; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7931; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 7932; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 7933; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 7934; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7935; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 7936; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7937; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 7938; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7939; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7940; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 7941; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7942; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 7943; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7944; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7945; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 7946; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 7947; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 7948; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 7949; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 7950; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7951; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 7952; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 7953; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7954; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 7955; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 7956; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 7957; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7958; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 7959; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7960; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 7961; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7962; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7963; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 7964; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7965; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 7966; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7967; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7968; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 7969; AVX512DQ-NEXT: vmovdqa64 %zmm6, 
%zmm8 7970; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 7971; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 7972; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 7973; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7974; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 7975; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 7976; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 7977; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 7978; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 7979; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 7980; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7981; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 7982; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 7983; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 7984; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 7985; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 7986; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 7987; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 7988; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 7989; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 7990; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 7991; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 7992; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 7993; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 7994; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 7995; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 7996; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 7997; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 7998; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 7999; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8000; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 8001; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 8002; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 8003; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8004; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 8005; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 8006; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} 8007; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 8008; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 8009; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 8010; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 8011; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 8012; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 8013; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 8014; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 8015; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 8016; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 8017; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 8018; AVX512DQ-NEXT: vmovdqa64 %zmm28, 64(%rsi) 8019; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi) 8020; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rdx) 8021; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rdx) 8022; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%rcx) 8023; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rcx) 8024; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%r8) 8025; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%r8) 8026; AVX512DQ-NEXT: vmovdqa64 %zmm24, 64(%r9) 8027; AVX512DQ-NEXT: vmovdqa64 %zmm25, (%r9) 8028; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%r11) 8029; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%r11) 8030; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%r10) 8031; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r10) 8032; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax) 8033; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) 8034; AVX512DQ-NEXT: vzeroupper 8035; AVX512DQ-NEXT: retq 8036; 8037; AVX512DQ-FCP-LABEL: load_i32_stride8_vf32: 
8038; AVX512DQ-FCP: # %bb.0: 8039; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8040; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 8041; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 8042; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 8043; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 8044; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 8045; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 8046; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 8047; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 8048; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 8049; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 8050; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 8051; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 8052; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 8053; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 8054; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 8055; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 8056; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16 8057; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15 8058; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 8059; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8060; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 8061; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 8062; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 8063; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 8064; AVX512DQ-FCP-NEXT: movb $-64, %dil 8065; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 8066; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} 8067; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 8068; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 8069; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 8070; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 8071; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] 8072; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 8073; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 8074; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 8075; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8076; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 8077; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8078; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 8079; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 8080; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 8081; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8082; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 8083; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 8084; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8085; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 8086; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 8087; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 8088; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 8089; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 8090; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 8091; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 8092; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 8093; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 8094; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 8095; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 8096; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 8097; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 8098; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8099; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 8100; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} 8101; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm1, %zmm4 8102; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 8103; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 8104; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 8105; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 8106; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 8107; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8108; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 8109; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8110; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 8111; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8112; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8113; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 8114; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8115; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 8116; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8117; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8118; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 8119; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 8120; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8121; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8122; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8123; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8124; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 8125; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8126; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8127; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8128; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 8129; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 8130; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8131; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 8132; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8133; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 8134; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8135; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8136; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 8137; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8138; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 8139; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8140; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8141; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 8142; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 8143; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8144; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8145; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8146; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8147; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 8148; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8149; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8150; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8151; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 8152; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 8153; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8154; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 8155; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8156; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 8157; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8158; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8159; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 8160; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8161; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 
%zmm13 8162; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8163; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8164; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 8165; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 8166; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8167; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8168; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8169; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8170; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 8171; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8172; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8173; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8174; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 8175; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 8176; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8177; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 8178; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8179; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 8180; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8181; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8182; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 8183; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8184; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 8185; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8186; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8187; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 8188; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 8189; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8190; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8191; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8192; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8193; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 8194; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8195; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8196; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8197; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 8198; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 8199; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8200; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 8201; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8202; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 8203; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8204; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8205; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 8206; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8207; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 8208; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8209; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8210; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 8211; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 8212; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 8213; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 8214; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 8215; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 8216; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 8217; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 8218; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8219; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 8220; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 8221; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = 
[7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 8222; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8223; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 8224; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 8225; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} 8226; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 8227; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 8228; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 8229; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 8230; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 8231; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 8232; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 8233; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 8234; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 8235; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 8236; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 8237; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi) 8238; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) 8239; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) 8240; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) 8241; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx) 8242; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) 8243; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) 8244; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r8) 8245; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9) 8246; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, (%r9) 8247; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11) 8248; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, (%r11) 8249; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10) 8250; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r10) 8251; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) 8252; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 8253; AVX512DQ-FCP-NEXT: vzeroupper 8254; AVX512DQ-FCP-NEXT: retq 8255; 8256; AVX512BW-LABEL: load_i32_stride8_vf32: 8257; AVX512BW: # %bb.0: 8258; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 8259; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 8260; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 8261; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 8262; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29 8263; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 8264; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 8265; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm31 8266; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 8267; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 8268; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 8269; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 8270; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 8271; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12 8272; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 8273; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm14 8274; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11 8275; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm16 8276; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm15 8277; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 8278; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8279; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 8280; AVX512BW-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 8281; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 8282; AVX512BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 8283; AVX512BW-NEXT: movb $-64, %dil 8284; AVX512BW-NEXT: kmovd %edi, %k1 8285; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} 8286; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 8287; AVX512BW-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 8288; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 8289; AVX512BW-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 8290; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm8[0,1,2,3],ymm10[4,5,6,7] 8291; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 8292; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 8293; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 8294; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 8295; AVX512BW-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 8296; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8297; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 8298; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 8299; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 8300; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8301; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 8302; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 8303; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8304; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 8305; AVX512BW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 8306; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 8307; AVX512BW-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 8308; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 8309; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 8310; AVX512BW-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 8311; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 8312; AVX512BW-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 8313; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 8314; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 8315; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 8316; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 8317; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 8318; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 8319; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} 8320; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 8321; AVX512BW-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 8322; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 8323; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 8324; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 8325; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 8326; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8327; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 8328; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8329; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 8330; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8331; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8332; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 8333; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8334; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 8335; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8336; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8337; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 8338; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 8339; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8340; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 8341; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8342; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8343; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 8344; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8345; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8346; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8347; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 8348; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 8349; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8350; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 8351; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8352; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 8353; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8354; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8355; 
AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 8356; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8357; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 8358; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8359; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8360; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 8361; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 8362; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8363; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 8364; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8365; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8366; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 8367; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8368; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8369; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8370; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 8371; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 8372; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8373; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 8374; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8375; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 8376; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8377; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8378; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 8379; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8380; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 8381; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8382; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8383; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 8384; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 8385; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8386; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 8387; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8388; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8389; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 8390; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8391; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8392; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8393; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 8394; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 8395; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8396; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 8397; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8398; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 8399; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8400; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8401; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 8402; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8403; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 8404; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8405; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8406; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 8407; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 8408; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8409; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 8410; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8411; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8412; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 8413; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8414; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8415; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8416; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 8417; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 8418; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8419; AVX512BW-NEXT: 
vmovdqa64 %zmm15, %zmm8 8420; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8421; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 8422; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8423; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8424; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 8425; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8426; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 8427; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8428; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8429; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 8430; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 8431; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 8432; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 8433; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 8434; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 8435; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 8436; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 8437; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8438; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 8439; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 8440; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 8441; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8442; AVX512BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 8443; AVX512BW-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 8444; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} 8445; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 8446; AVX512BW-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 8447; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 8448; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 8449; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 8450; AVX512BW-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 8451; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 8452; AVX512BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 8453; AVX512BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 8454; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 8455; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 8456; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rsi) 8457; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rsi) 8458; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) 8459; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx) 8460; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rcx) 8461; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rcx) 8462; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) 8463; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) 8464; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%r9) 8465; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9) 8466; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r11) 8467; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r11) 8468; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r10) 8469; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r10) 8470; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) 8471; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) 8472; AVX512BW-NEXT: vzeroupper 8473; AVX512BW-NEXT: retq 8474; 8475; AVX512BW-FCP-LABEL: load_i32_stride8_vf32: 8476; AVX512BW-FCP: # %bb.0: 8477; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8478; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 8479; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 8480; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 8481; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 8482; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 8483; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 8484; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 8485; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 8486; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 8487; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 8488; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 8489; 
AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 8490; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 8491; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 8492; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 8493; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 8494; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16 8495; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15 8496; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 8497; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8498; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 8499; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 8500; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 8501; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 8502; AVX512BW-FCP-NEXT: movb $-64, %dil 8503; AVX512BW-FCP-NEXT: kmovd %edi, %k1 8504; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} 8505; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 8506; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 8507; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 8508; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 8509; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] 8510; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 8511; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 8512; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 8513; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8514; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 8515; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8516; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 8517; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 8518; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 8519; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8520; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 8521; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 8522; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8523; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 8524; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 8525; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 8526; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 8527; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 8528; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 8529; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 8530; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 8531; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 8532; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 8533; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 8534; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 8535; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 8536; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8537; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 8538; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} 8539; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 8540; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 8541; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 8542; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 8543; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 8544; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 8545; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8546; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 8547; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8548; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 8549; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8550; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8551; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 8552; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8553; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 8554; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8555; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8556; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 8557; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 8558; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8559; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8560; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8561; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8562; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 8563; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8564; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8565; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8566; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 8567; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 8568; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8569; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 8570; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8571; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 8572; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8573; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8574; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 8575; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8576; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 8577; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8578; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8579; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 8580; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 8581; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8582; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8583; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8584; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8585; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 8586; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8587; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8588; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8589; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 8590; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 8591; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8592; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 8593; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8594; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 8595; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8596; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8597; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 8598; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8599; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 8600; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8601; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8602; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 8603; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 8604; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8605; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8606; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8607; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8608; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 8609; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8610; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8611; AVX512BW-FCP-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8612; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 8613; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 8614; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8615; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 8616; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8617; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 8618; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8619; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8620; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 8621; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8622; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 8623; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8624; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8625; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 8626; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 8627; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8628; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8629; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8630; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8631; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 8632; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8633; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8634; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8635; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 8636; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 8637; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8638; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 8639; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8640; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 8641; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8642; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8643; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 8644; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8645; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 8646; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8647; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8648; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 8649; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 8650; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 8651; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 8652; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 8653; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 8654; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 8655; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 8656; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8657; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 8658; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 8659; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 8660; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8661; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 8662; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 8663; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} 8664; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 8665; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 8666; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 8667; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 8668; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 8669; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 8670; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm6, %zmm3 {%k1} 8671; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 8672; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 8673; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 8674; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 8675; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi) 8676; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) 8677; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) 8678; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) 8679; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx) 8680; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) 8681; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) 8682; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8) 8683; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9) 8684; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9) 8685; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11) 8686; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, (%r11) 8687; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10) 8688; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10) 8689; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) 8690; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 8691; AVX512BW-FCP-NEXT: vzeroupper 8692; AVX512BW-FCP-NEXT: retq 8693; 8694; AVX512DQ-BW-LABEL: load_i32_stride8_vf32: 8695; AVX512DQ-BW: # %bb.0: 8696; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 8697; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 8698; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 8699; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 8700; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm29 8701; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 8702; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm30 8703; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm31 8704; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3 8705; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7 8706; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6 8707; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm9 8708; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm5 8709; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm12 8710; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm2 8711; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm14 8712; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm11 8713; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm16 8714; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm15 8715; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 8716; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8717; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 8718; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 8719; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18 8720; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 8721; AVX512DQ-BW-NEXT: movb $-64, %dil 8722; AVX512DQ-BW-NEXT: kmovd %edi, %k1 8723; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} 8724; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 8725; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 8726; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 8727; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 8728; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] 8729; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 8730; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 8731; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 8732; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 8733; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 8734; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8735; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 8736; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 8737; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 8738; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm13[0,1,2,3],ymm8[4,5,6,7] 8739; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 8740; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 8741; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8742; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm10 8743; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 8744; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 8745; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 8746; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 8747; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 8748; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 8749; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4 8750; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 8751; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 8752; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 8753; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 8754; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 8755; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 8756; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 8757; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} 8758; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 8759; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 8760; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 8761; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 8762; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 8763; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 8764; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8765; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 8766; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8767; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 8768; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8769; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8770; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 8771; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8772; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 8773; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8774; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8775; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 8776; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 8777; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8778; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 8779; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8780; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8781; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 8782; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8783; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8784; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8785; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 8786; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 8787; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8788; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 8789; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8790; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 8791; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8792; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8793; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 8794; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8795; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 8796; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8797; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8798; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 8799; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm6, %zmm8 8800; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8801; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 8802; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8803; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8804; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 8805; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8806; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8807; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8808; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 8809; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 8810; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8811; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 8812; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8813; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 8814; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8815; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8816; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 8817; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8818; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 8819; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8820; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8821; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 8822; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 8823; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8824; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 8825; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8826; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8827; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 8828; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8829; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8830; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8831; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 8832; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 8833; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8834; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 8835; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8836; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 8837; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8838; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8839; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 8840; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8841; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 8842; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8843; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8844; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 8845; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 8846; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8847; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 8848; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8849; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8850; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 8851; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 8852; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8853; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 8854; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 8855; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 8856; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8857; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 8858; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8859; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 8860; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, 
%zmm10 8861; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8862; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 8863; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8864; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 8865; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8866; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8867; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 8868; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 8869; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 8870; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 8871; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 8872; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 8873; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 8874; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 8875; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 8876; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 8877; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 8878; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 8879; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8880; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 8881; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 8882; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} 8883; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 8884; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 8885; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 8886; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 8887; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 8888; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 8889; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 8890; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 8891; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 8892; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 8893; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 8894; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 64(%rsi) 8895; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rsi) 8896; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) 8897; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rdx) 8898; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%rcx) 8899; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%rcx) 8900; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r8) 8901; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r8) 8902; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 64(%r9) 8903; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, (%r9) 8904; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%r11) 8905; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, (%r11) 8906; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%r10) 8907; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r10) 8908; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax) 8909; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) 8910; AVX512DQ-BW-NEXT: vzeroupper 8911; AVX512DQ-BW-NEXT: retq 8912; 8913; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf32: 8914; AVX512DQ-BW-FCP: # %bb.0: 8915; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8916; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 8917; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 8918; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 8919; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 8920; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 8921; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 8922; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 8923; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 8924; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 8925; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 8926; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
576(%rdi), %zmm9 8927; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 8928; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 8929; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 8930; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 8931; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 8932; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16 8933; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15 8934; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 8935; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8936; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 8937; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 8938; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 8939; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 8940; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil 8941; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 8942; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} 8943; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 8944; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 8945; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 8946; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 8947; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] 8948; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 8949; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 8950; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 8951; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8952; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 8953; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8954; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 8955; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 8956; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 8957; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8958; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 8959; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 8960; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8961; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 8962; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 8963; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 8964; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 8965; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 8966; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 8967; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 8968; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 8969; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 8970; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 8971; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 8972; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 8973; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 8974; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8975; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 8976; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} 8977; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 8978; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 8979; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 8980; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 8981; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 8982; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 8983; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8984; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm15, %zmm8 8985; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 8986; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 8987; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 8988; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 8989; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 8990; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 8991; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 8992; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 8993; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 8994; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 8995; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 8996; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 8997; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 8998; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 8999; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 9000; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 9001; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 9002; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 9003; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 9004; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 9005; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 9006; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 9007; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 9008; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 9009; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 9010; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 9011; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 9012; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 9013; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 9014; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 9015; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 9016; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 9017; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 9018; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 9019; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 9020; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 9021; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 9022; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 9023; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 9024; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 9025; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 9026; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 9027; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 9028; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 9029; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 9030; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 9031; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 9032; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 9033; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 9034; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 9035; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 9036; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 9037; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 9038; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 9039; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 9040; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 9041; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 9042; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, 
%zmm4, %zmm8 9043; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 9044; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 9045; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 9046; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 9047; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 9048; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 9049; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 9050; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 9051; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 9052; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 9053; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 9054; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 9055; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 9056; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 9057; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 9058; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 9059; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 9060; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 9061; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 9062; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 9063; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 9064; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 9065; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 9066; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 9067; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 9068; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 9069; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 9070; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 9071; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 9072; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 9073; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 9074; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 9075; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 9076; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 9077; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 9078; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 9079; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 9080; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} 9081; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 9082; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 9083; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 9084; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 9085; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] 9086; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 9087; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 9088; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 9089; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 9090; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 9091; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} 9092; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 9093; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 9094; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 9095; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 9096; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 9097; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 9098; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 9099; AVX512DQ-BW-FCP-NEXT: 
vpermt2d %zmm16, %zmm10, %zmm15 9100; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 9101; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} 9102; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 9103; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 9104; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 9105; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 9106; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 9107; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 9108; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} 9109; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 9110; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 9111; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 9112; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 9113; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi) 9114; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) 9115; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) 9116; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) 9117; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx) 9118; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) 9119; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) 9120; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8) 9121; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9) 9122; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9) 9123; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11) 9124; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, (%r11) 9125; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10) 9126; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10) 9127; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) 9128; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 9129; AVX512DQ-BW-FCP-NEXT: vzeroupper 9130; AVX512DQ-BW-FCP-NEXT: retq 9131 %wide.vec = load <256 x i32>, ptr %in.vec, align 64 9132 %strided.vec0 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248> 9133 %strided.vec1 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249> 9134 %strided.vec2 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122, i32 130, i32 138, i32 146, i32 154, i32 162, i32 170, i32 178, i32 186, i32 194, i32 202, i32 210, i32 218, i32 226, i32 234, i32 242, i32 250> 9135 %strided.vec3 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123, i32 131, i32 139, i32 147, i32 155, i32 163, i32 171, i32 179, i32 187, i32 195, i32 203, i32 211, i32 219, i32 227, i32 235, i32 243, i32 251> 9136 %strided.vec4 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124, i32 132, i32 140, i32 148, i32 156, i32 
164, i32 172, i32 180, i32 188, i32 196, i32 204, i32 212, i32 220, i32 228, i32 236, i32 244, i32 252> 9137 %strided.vec5 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125, i32 133, i32 141, i32 149, i32 157, i32 165, i32 173, i32 181, i32 189, i32 197, i32 205, i32 213, i32 221, i32 229, i32 237, i32 245, i32 253> 9138 %strided.vec6 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126, i32 134, i32 142, i32 150, i32 158, i32 166, i32 174, i32 182, i32 190, i32 198, i32 206, i32 214, i32 222, i32 230, i32 238, i32 246, i32 254> 9139 %strided.vec7 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127, i32 135, i32 143, i32 151, i32 159, i32 167, i32 175, i32 183, i32 191, i32 199, i32 207, i32 215, i32 223, i32 231, i32 239, i32 247, i32 255> 9140 store <32 x i32> %strided.vec0, ptr %out.vec0, align 64 9141 store <32 x i32> %strided.vec1, ptr %out.vec1, align 64 9142 store <32 x i32> %strided.vec2, ptr %out.vec2, align 64 9143 store <32 x i32> %strided.vec3, ptr %out.vec3, align 64 9144 store <32 x i32> %strided.vec4, ptr %out.vec4, align 64 9145 store <32 x i32> %strided.vec5, ptr %out.vec5, align 64 9146 store <32 x i32> %strided.vec6, ptr %out.vec6, align 64 9147 store <32 x i32> %strided.vec7, ptr %out.vec7, align 64 9148 ret void 9149} 9150 9151define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { 9152; SSE-LABEL: load_i32_stride8_vf64: 9153; SSE: # %bb.0: 9154; SSE-NEXT: subq $2232, %rsp # imm = 0x8B8 9155; SSE-NEXT: movaps 288(%rdi), %xmm4 9156; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9157; SSE-NEXT: movaps 352(%rdi), %xmm5 9158; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill 9159; SSE-NEXT: movaps 320(%rdi), %xmm6 9160; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9161; SSE-NEXT: movaps 416(%rdi), %xmm7 9162; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9163; SSE-NEXT: movaps 384(%rdi), %xmm8 9164; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9165; SSE-NEXT: movaps 480(%rdi), %xmm9 9166; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9167; SSE-NEXT: movaps 448(%rdi), %xmm3 9168; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9169; SSE-NEXT: movaps 160(%rdi), %xmm10 9170; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9171; SSE-NEXT: movaps 128(%rdi), %xmm1 9172; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9173; SSE-NEXT: movaps 224(%rdi), %xmm2 9174; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9175; SSE-NEXT: movaps 192(%rdi), %xmm0 9176; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9177; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 9178; SSE-NEXT: movaps %xmm1, %xmm2 9179; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] 9180; SSE-NEXT: movaps %xmm2, %xmm1 9181; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 9182; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
9183; SSE-NEXT: movaps %xmm3, %xmm1 9184; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] 9185; SSE-NEXT: movaps %xmm8, %xmm3 9186; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] 9187; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 9188; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9189; SSE-NEXT: movaps %xmm3, %xmm0 9190; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 9191; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9192; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] 9193; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9194; SSE-NEXT: movaps %xmm6, %xmm0 9195; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 9196; SSE-NEXT: movaps 256(%rdi), %xmm2 9197; SSE-NEXT: movaps %xmm2, %xmm1 9198; SSE-NEXT: movaps %xmm2, %xmm3 9199; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 9200; SSE-NEXT: movaps %xmm1, %xmm2 9201; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9202; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9203; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9204; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9205; SSE-NEXT: movaps 736(%rdi), %xmm9 9206; SSE-NEXT: movaps 704(%rdi), %xmm0 9207; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9208; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 9209; SSE-NEXT: movaps 672(%rdi), %xmm2 9210; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9211; SSE-NEXT: movaps 640(%rdi), %xmm1 9212; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9213; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9214; SSE-NEXT: movaps %xmm1, %xmm2 9215; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9216; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9217; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9218; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9219; SSE-NEXT: movaps 608(%rdi), %xmm2 9220; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9221; SSE-NEXT: movaps 576(%rdi), %xmm1 9222; SSE-NEXT: movaps %xmm1, %xmm0 9223; SSE-NEXT: movaps %xmm1, %xmm4 9224; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 9225; SSE-NEXT: movaps 544(%rdi), %xmm15 9226; SSE-NEXT: movaps 512(%rdi), %xmm2 9227; SSE-NEXT: movaps %xmm2, %xmm1 9228; SSE-NEXT: movaps %xmm2, %xmm6 9229; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] 9230; SSE-NEXT: movaps %xmm1, %xmm2 9231; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9232; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9233; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9234; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9235; SSE-NEXT: movaps 992(%rdi), %xmm1 9236; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9237; SSE-NEXT: movaps 960(%rdi), %xmm0 9238; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9239; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9240; SSE-NEXT: movaps 928(%rdi), %xmm2 9241; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9242; SSE-NEXT: movaps 896(%rdi), %xmm1 9243; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9244; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9245; SSE-NEXT: movaps %xmm1, %xmm2 9246; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9247; SSE-NEXT: movaps 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9248; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9249; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9250; SSE-NEXT: movaps 864(%rdi), %xmm1 9251; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9252; SSE-NEXT: movaps 832(%rdi), %xmm0 9253; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9254; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9255; SSE-NEXT: movaps 800(%rdi), %xmm14 9256; SSE-NEXT: movaps 768(%rdi), %xmm2 9257; SSE-NEXT: movaps %xmm2, %xmm1 9258; SSE-NEXT: movaps %xmm2, %xmm8 9259; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] 9260; SSE-NEXT: movaps %xmm1, %xmm2 9261; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9262; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9263; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9264; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9265; SSE-NEXT: movaps 1248(%rdi), %xmm1 9266; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9267; SSE-NEXT: movaps 1216(%rdi), %xmm0 9268; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9269; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9270; SSE-NEXT: movaps 1184(%rdi), %xmm2 9271; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9272; SSE-NEXT: movaps 1152(%rdi), %xmm1 9273; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9274; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9275; SSE-NEXT: movaps %xmm1, %xmm2 9276; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9277; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9278; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9279; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9280; SSE-NEXT: movaps 1120(%rdi), %xmm2 9281; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9282; SSE-NEXT: movaps 1088(%rdi), %xmm1 9283; SSE-NEXT: movaps %xmm1, %xmm0 9284; SSE-NEXT: movaps %xmm1, %xmm7 9285; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 9286; SSE-NEXT: movaps 1056(%rdi), %xmm2 9287; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9288; SSE-NEXT: movaps 1024(%rdi), %xmm1 9289; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9290; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9291; SSE-NEXT: movaps %xmm1, %xmm2 9292; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9293; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9294; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9295; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9296; SSE-NEXT: movaps 1504(%rdi), %xmm1 9297; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9298; SSE-NEXT: movaps 1472(%rdi), %xmm0 9299; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9300; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9301; SSE-NEXT: movaps 1440(%rdi), %xmm2 9302; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9303; SSE-NEXT: movaps 1408(%rdi), %xmm1 9304; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9305; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9306; SSE-NEXT: movaps %xmm1, %xmm2 9307; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9308; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9309; SSE-NEXT: unpckhpd {{.*#+}} 
xmm1 = xmm1[1],xmm0[1] 9310; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9311; SSE-NEXT: movaps 1376(%rdi), %xmm1 9312; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9313; SSE-NEXT: movaps 1344(%rdi), %xmm0 9314; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9315; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9316; SSE-NEXT: movaps 1312(%rdi), %xmm2 9317; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9318; SSE-NEXT: movaps 1280(%rdi), %xmm1 9319; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9320; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9321; SSE-NEXT: movaps %xmm1, %xmm2 9322; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9323; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9324; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9325; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9326; SSE-NEXT: movaps 1760(%rdi), %xmm1 9327; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9328; SSE-NEXT: movaps 1728(%rdi), %xmm0 9329; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9330; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9331; SSE-NEXT: movaps 1696(%rdi), %xmm2 9332; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9333; SSE-NEXT: movaps 1664(%rdi), %xmm1 9334; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9335; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9336; SSE-NEXT: movaps %xmm1, %xmm2 9337; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9338; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9339; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9340; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9341; SSE-NEXT: movaps 1632(%rdi), %xmm1 9342; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9343; SSE-NEXT: movaps 1600(%rdi), %xmm0 9344; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9345; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9346; SSE-NEXT: movaps 1568(%rdi), %xmm5 9347; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9348; SSE-NEXT: movaps 1536(%rdi), %xmm2 9349; SSE-NEXT: movaps %xmm2, %xmm1 9350; SSE-NEXT: movaps %xmm2, %xmm13 9351; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] 9352; SSE-NEXT: movaps %xmm1, %xmm2 9353; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9354; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9355; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9356; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9357; SSE-NEXT: movaps 2016(%rdi), %xmm1 9358; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9359; SSE-NEXT: movaps 1984(%rdi), %xmm0 9360; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9361; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9362; SSE-NEXT: movaps 1952(%rdi), %xmm2 9363; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9364; SSE-NEXT: movaps 1920(%rdi), %xmm1 9365; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9366; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9367; SSE-NEXT: movaps %xmm1, %xmm2 9368; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9369; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9370; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = 
xmm1[1],xmm0[1] 9371; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9372; SSE-NEXT: movaps 1888(%rdi), %xmm1 9373; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9374; SSE-NEXT: movaps 1856(%rdi), %xmm0 9375; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9376; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9377; SSE-NEXT: movaps 1824(%rdi), %xmm2 9378; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9379; SSE-NEXT: movaps 1792(%rdi), %xmm1 9380; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9381; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9382; SSE-NEXT: movaps %xmm1, %xmm5 9383; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] 9384; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9385; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9386; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9387; SSE-NEXT: movaps 96(%rdi), %xmm2 9388; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9389; SSE-NEXT: movaps 64(%rdi), %xmm12 9390; SSE-NEXT: movaps %xmm12, %xmm0 9391; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 9392; SSE-NEXT: movaps (%rdi), %xmm10 9393; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9394; SSE-NEXT: movaps 32(%rdi), %xmm1 9395; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9396; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] 9397; SSE-NEXT: movaps %xmm10, %xmm5 9398; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] 9399; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9400; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] 9401; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9402; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 9403; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 9404; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] 9405; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9406; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 9407; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] 9408; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 9409; SSE-NEXT: unpckhps (%rsp), %xmm10 # 16-byte Folded Reload 9410; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] 9411; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 9412; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] 9413; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9414; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 9415; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 9416; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] 9417; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9418; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9419; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 9420; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 9421; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] 9422; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9423; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] 9424; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9425; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 9426; SSE-NEXT: unpckhps {{.*#+}} xmm15 = 
xmm15[2],xmm9[2],xmm15[3],xmm9[3] 9427; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9428; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9429; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] 9430; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9431; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9432; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 9433; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9434; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm14[2],xmm8[3],xmm14[3] 9435; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9436; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 9437; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 9438; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] 9439; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 9440; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 9441; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] 9442; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 9443; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] 9444; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9445; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9446; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9447; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 9448; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9449; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9450; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 9451; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] 9452; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 9453; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 9454; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] 9455; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9456; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9457; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 9458; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9459; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9460; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9461; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 9462; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9463; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9464; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 9465; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] 9466; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9467; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 9468; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 9469; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] 9470; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9471; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9472; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 9473; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9474; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 9475; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] 9476; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9477; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9478; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9479; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 9480; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9481; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 9482; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] 9483; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9484; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9485; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 9486; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] 9487; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9488; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9489; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 9490; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] 9491; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9492; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9493; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 9494; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] 9495; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9496; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 9497; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 9498; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] 9499; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 9500; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] 9501; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9502; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 9503; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 9504; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] 9505; SSE-NEXT: movaps %xmm5, %xmm7 9506; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] 9507; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9508; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1] 9509; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9510; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9511; SSE-NEXT: movaps %xmm5, %xmm7 9512; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] 9513; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9514; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1] 9515; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9516; SSE-NEXT: movaps %xmm0, %xmm5 9517; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] 9518; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9519; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] 9520; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9521; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9522; SSE-NEXT: movaps %xmm0, %xmm5 9523; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 9524; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] 9525; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9526; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] 9527; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9528; SSE-NEXT: movaps %xmm1, %xmm5 9529; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0] 9530; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9531; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] 9532; 
SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9533; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9534; SSE-NEXT: movaps %xmm0, %xmm5 9535; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9536; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] 9537; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9538; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 9539; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9540; SSE-NEXT: movaps %xmm9, %xmm5 9541; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0] 9542; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9543; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1] 9544; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9545; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9546; SSE-NEXT: movaps %xmm0, %xmm5 9547; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9548; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] 9549; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9550; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 9551; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9552; SSE-NEXT: movaps %xmm6, %xmm5 9553; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0] 9554; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9555; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] 9556; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9557; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9558; SSE-NEXT: movaps %xmm0, %xmm5 9559; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9560; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] 9561; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9562; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 9563; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9564; SSE-NEXT: movaps %xmm4, %xmm5 9565; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9566; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] 9567; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9568; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] 9569; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9570; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9571; SSE-NEXT: movaps %xmm0, %xmm5 9572; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9573; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] 9574; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9575; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 9576; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9577; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9578; SSE-NEXT: movaps %xmm0, %xmm4 9579; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] 9580; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9581; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] 9582; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9583; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 9584; SSE-NEXT: movaps %xmm6, %xmm0 9585; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9586; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 9587; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9588; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] 9589; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
9590; SSE-NEXT: movaps %xmm13, %xmm0 9591; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9592; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 9593; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9594; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] 9595; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9596; SSE-NEXT: movaps %xmm12, %xmm0 9597; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9598; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 9599; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9600; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] 9601; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9602; SSE-NEXT: movaps 240(%rdi), %xmm2 9603; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9604; SSE-NEXT: movaps 208(%rdi), %xmm7 9605; SSE-NEXT: movaps %xmm7, %xmm0 9606; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 9607; SSE-NEXT: movaps 176(%rdi), %xmm3 9608; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9609; SSE-NEXT: movaps 144(%rdi), %xmm2 9610; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9611; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 9612; SSE-NEXT: movaps %xmm2, %xmm1 9613; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 9614; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9615; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 9616; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9617; SSE-NEXT: movaps 368(%rdi), %xmm3 9618; SSE-NEXT: movaps 336(%rdi), %xmm0 9619; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill 9620; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 9621; SSE-NEXT: movaps 304(%rdi), %xmm4 9622; SSE-NEXT: movaps 272(%rdi), %xmm1 9623; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9624; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 9625; SSE-NEXT: movaps %xmm1, %xmm2 9626; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9627; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9628; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9629; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9630; SSE-NEXT: movaps 496(%rdi), %xmm5 9631; SSE-NEXT: movaps 464(%rdi), %xmm0 9632; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9633; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 9634; SSE-NEXT: movaps 432(%rdi), %xmm6 9635; SSE-NEXT: movaps 400(%rdi), %xmm1 9636; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9637; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] 9638; SSE-NEXT: movaps %xmm1, %xmm2 9639; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9640; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9641; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9642; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9643; SSE-NEXT: movaps 624(%rdi), %xmm9 9644; SSE-NEXT: movaps 592(%rdi), %xmm0 9645; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9646; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 9647; SSE-NEXT: movaps 560(%rdi), %xmm10 9648; SSE-NEXT: movaps 528(%rdi), %xmm1 9649; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9650; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] 9651; SSE-NEXT: movaps %xmm1, %xmm2 9652; SSE-NEXT: movlhps 
{{.*#+}} xmm2 = xmm2[0],xmm0[0] 9653; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9654; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9655; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9656; SSE-NEXT: movaps 752(%rdi), %xmm12 9657; SSE-NEXT: movaps 720(%rdi), %xmm0 9658; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9659; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] 9660; SSE-NEXT: movaps 688(%rdi), %xmm13 9661; SSE-NEXT: movaps 656(%rdi), %xmm2 9662; SSE-NEXT: movaps %xmm2, %xmm1 9663; SSE-NEXT: movaps %xmm2, %xmm15 9664; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] 9665; SSE-NEXT: movaps %xmm1, %xmm2 9666; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9667; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9668; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9669; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9670; SSE-NEXT: movaps 880(%rdi), %xmm1 9671; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9672; SSE-NEXT: movaps 848(%rdi), %xmm0 9673; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9674; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9675; SSE-NEXT: movaps 816(%rdi), %xmm2 9676; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9677; SSE-NEXT: movaps 784(%rdi), %xmm1 9678; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9679; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9680; SSE-NEXT: movaps %xmm1, %xmm2 9681; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9682; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9683; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9684; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9685; SSE-NEXT: movaps 1008(%rdi), %xmm14 9686; SSE-NEXT: movaps 976(%rdi), %xmm0 9687; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9688; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] 9689; SSE-NEXT: movaps 944(%rdi), %xmm2 9690; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9691; SSE-NEXT: movaps 912(%rdi), %xmm1 9692; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9693; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9694; SSE-NEXT: movaps %xmm1, %xmm2 9695; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9696; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9697; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9698; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9699; SSE-NEXT: movaps 1136(%rdi), %xmm1 9700; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9701; SSE-NEXT: movaps 1104(%rdi), %xmm0 9702; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9703; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9704; SSE-NEXT: movaps 1072(%rdi), %xmm2 9705; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9706; SSE-NEXT: movaps 1040(%rdi), %xmm1 9707; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9708; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9709; SSE-NEXT: movaps %xmm1, %xmm2 9710; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9711; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9712; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9713; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill 9714; SSE-NEXT: movaps 1264(%rdi), %xmm11 9715; SSE-NEXT: movaps 1232(%rdi), %xmm0 9716; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9717; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] 9718; SSE-NEXT: movaps 1200(%rdi), %xmm2 9719; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9720; SSE-NEXT: movaps 1168(%rdi), %xmm1 9721; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9722; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9723; SSE-NEXT: movaps %xmm1, %xmm2 9724; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9725; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9726; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9727; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9728; SSE-NEXT: movaps 1392(%rdi), %xmm1 9729; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9730; SSE-NEXT: movaps 1360(%rdi), %xmm0 9731; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9732; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9733; SSE-NEXT: movaps 1328(%rdi), %xmm2 9734; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9735; SSE-NEXT: movaps 1296(%rdi), %xmm1 9736; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9737; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9738; SSE-NEXT: movaps %xmm1, %xmm2 9739; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9740; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9741; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9742; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9743; SSE-NEXT: movaps 1520(%rdi), %xmm1 9744; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9745; SSE-NEXT: movaps 1488(%rdi), %xmm0 9746; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9747; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9748; SSE-NEXT: movaps 1456(%rdi), %xmm2 9749; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9750; SSE-NEXT: movaps 1424(%rdi), %xmm1 9751; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9752; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9753; SSE-NEXT: movaps %xmm1, %xmm2 9754; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9755; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9756; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9757; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9758; SSE-NEXT: movaps 1648(%rdi), %xmm1 9759; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9760; SSE-NEXT: movaps 1616(%rdi), %xmm0 9761; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9762; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9763; SSE-NEXT: movaps 1584(%rdi), %xmm2 9764; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9765; SSE-NEXT: movaps 1552(%rdi), %xmm1 9766; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9767; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9768; SSE-NEXT: movaps %xmm1, %xmm2 9769; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9770; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9771; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9772; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9773; SSE-NEXT: movaps 1776(%rdi), %xmm8 9774; SSE-NEXT: movaps 1744(%rdi), 
%xmm0 9775; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9776; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] 9777; SSE-NEXT: movaps 1712(%rdi), %xmm2 9778; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9779; SSE-NEXT: movaps 1680(%rdi), %xmm1 9780; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9781; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9782; SSE-NEXT: movaps %xmm1, %xmm2 9783; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9784; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9785; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9786; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9787; SSE-NEXT: movaps 1904(%rdi), %xmm1 9788; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9789; SSE-NEXT: movaps 1872(%rdi), %xmm0 9790; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9791; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9792; SSE-NEXT: movaps 1840(%rdi), %xmm2 9793; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9794; SSE-NEXT: movaps 1808(%rdi), %xmm1 9795; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9796; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9797; SSE-NEXT: movaps %xmm1, %xmm2 9798; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9799; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9800; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9801; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9802; SSE-NEXT: movaps 2032(%rdi), %xmm1 9803; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9804; SSE-NEXT: movaps 2000(%rdi), %xmm0 9805; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9806; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9807; SSE-NEXT: movaps 1968(%rdi), %xmm2 9808; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9809; SSE-NEXT: movaps 1936(%rdi), %xmm1 9810; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9811; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9812; SSE-NEXT: movaps %xmm1, %xmm2 9813; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9814; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9815; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9816; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9817; SSE-NEXT: movaps 112(%rdi), %xmm1 9818; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9819; SSE-NEXT: movaps 80(%rdi), %xmm0 9820; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9821; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9822; SSE-NEXT: movaps 16(%rdi), %xmm1 9823; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9824; SSE-NEXT: movaps 48(%rdi), %xmm2 9825; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9826; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9827; SSE-NEXT: movaps %xmm1, %xmm2 9828; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 9829; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9830; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 9831; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9832; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 9833; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] 9834; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9835; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9836; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 9837; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload 9838; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] 9839; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill 9840; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9841; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] 9842; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9843; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9844; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] 9845; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9846; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9847; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] 9848; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9849; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] 9850; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9851; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9852; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] 9853; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 9854; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] 9855; SSE-NEXT: movaps %xmm15, %xmm4 9856; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] 9857; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 9858; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 9859; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] 9860; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9861; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 9862; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 9863; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] 9864; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 9865; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] 9866; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 9867; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 9868; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] 9869; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9870; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 9871; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 9872; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] 9873; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 9874; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 9875; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] 9876; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9877; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 9878; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] 9879; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 9880; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 9881; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] 9882; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9883; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 9884; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm11 # 16-byte Folded Reload 9885; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] 9886; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 9887; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 9888; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] 9889; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9890; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 9891; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 9892; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] 9893; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 9894; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 9895; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] 9896; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9897; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 9898; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 9899; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] 9900; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 9901; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 9902; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] 9903; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9904; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 9905; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] 9906; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9907; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 9908; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] 9909; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9910; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9911; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 9912; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] 9913; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9914; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9915; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 9916; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9917; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9918; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9919; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 9920; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9921; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9922; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9923; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 9924; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9925; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9926; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9927; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 9928; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9929; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9930; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9931; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 9932; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9933; SSE-NEXT: movaps %xmm2, %xmm0 9934; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] 9935; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
9936; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] 9937; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9938; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9939; SSE-NEXT: movaps %xmm0, %xmm2 9940; SSE-NEXT: movaps (%rsp), %xmm7 # 16-byte Reload 9941; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] 9942; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9943; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] 9944; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9945; SSE-NEXT: movaps %xmm1, %xmm0 9946; SSE-NEXT: movaps %xmm1, %xmm2 9947; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9948; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] 9949; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9950; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 9951; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9952; SSE-NEXT: movaps %xmm5, %xmm2 9953; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9954; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] 9955; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9956; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] 9957; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9958; SSE-NEXT: movaps %xmm4, %xmm2 9959; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] 9960; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9961; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] 9962; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill 9963; SSE-NEXT: movaps %xmm15, %xmm0 9964; SSE-NEXT: movaps %xmm15, %xmm2 9965; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9966; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] 9967; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9968; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 9969; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9970; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9971; SSE-NEXT: movaps %xmm0, %xmm15 9972; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] 9973; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] 9974; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9975; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9976; SSE-NEXT: movaps %xmm0, %xmm2 9977; SSE-NEXT: movaps %xmm14, %xmm1 9978; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0] 9979; SSE-NEXT: movaps %xmm2, %xmm14 9980; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 9981; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9982; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9983; SSE-NEXT: movaps %xmm0, %xmm2 9984; SSE-NEXT: movaps %xmm13, %xmm1 9985; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm13[0] 9986; SSE-NEXT: movaps %xmm2, %xmm13 9987; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 9988; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9989; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9990; SSE-NEXT: movaps %xmm0, %xmm7 9991; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] 9992; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] 9993; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9994; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9995; SSE-NEXT: movaps %xmm0, %xmm6 9996; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0] 9997; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] 9998; SSE-NEXT: movaps 
%xmm0, %xmm12 9999; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10000; SSE-NEXT: movaps %xmm0, %xmm2 10001; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] 10002; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] 10003; SSE-NEXT: movaps %xmm0, %xmm10 10004; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10005; SSE-NEXT: movaps %xmm0, %xmm5 10006; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm9[0] 10007; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] 10008; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10009; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10010; SSE-NEXT: movaps %xmm0, %xmm4 10011; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] 10012; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] 10013; SSE-NEXT: movaps %xmm0, %xmm8 10014; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10015; SSE-NEXT: movaps %xmm0, %xmm3 10016; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10017; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] 10018; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 10019; SSE-NEXT: movaps %xmm0, %xmm9 10020; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 10021; SSE-NEXT: movaps %xmm11, %xmm0 10022; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10023; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 10024; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] 10025; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10026; SSE-NEXT: movaps %xmm1, 224(%rsi) 10027; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10028; SSE-NEXT: movaps %xmm1, 160(%rsi) 10029; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10030; SSE-NEXT: movaps %xmm1, 96(%rsi) 10031; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10032; SSE-NEXT: movaps %xmm1, 32(%rsi) 10033; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10034; SSE-NEXT: movaps %xmm1, 240(%rsi) 10035; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10036; SSE-NEXT: movaps %xmm1, 176(%rsi) 10037; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10038; SSE-NEXT: movaps %xmm1, 112(%rsi) 10039; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10040; SSE-NEXT: movaps %xmm1, 48(%rsi) 10041; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10042; SSE-NEXT: movaps %xmm1, 192(%rsi) 10043; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10044; SSE-NEXT: movaps %xmm1, 128(%rsi) 10045; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10046; SSE-NEXT: movaps %xmm1, 64(%rsi) 10047; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10048; SSE-NEXT: movaps %xmm1, (%rsi) 10049; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10050; SSE-NEXT: movaps %xmm1, 208(%rsi) 10051; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10052; SSE-NEXT: movaps %xmm1, 144(%rsi) 10053; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10054; SSE-NEXT: movaps %xmm1, 80(%rsi) 10055; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10056; SSE-NEXT: movaps %xmm1, 16(%rsi) 10057; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10058; SSE-NEXT: movaps %xmm1, 224(%rdx) 10059; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10060; SSE-NEXT: movaps %xmm1, 240(%rdx) 10061; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10062; SSE-NEXT: movaps %xmm1, 192(%rdx) 10063; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10064; SSE-NEXT: movaps %xmm1, 208(%rdx) 10065; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10066; SSE-NEXT: movaps %xmm1, 160(%rdx) 10067; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10068; SSE-NEXT: movaps %xmm1, 176(%rdx) 10069; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10070; SSE-NEXT: movaps %xmm1, 128(%rdx) 10071; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10072; SSE-NEXT: movaps %xmm1, 144(%rdx) 10073; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10074; SSE-NEXT: movaps %xmm1, 96(%rdx) 10075; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10076; SSE-NEXT: movaps %xmm1, 112(%rdx) 10077; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10078; SSE-NEXT: movaps %xmm1, 64(%rdx) 10079; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10080; SSE-NEXT: movaps %xmm1, 80(%rdx) 10081; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10082; SSE-NEXT: movaps %xmm1, 32(%rdx) 10083; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10084; SSE-NEXT: movaps %xmm1, 48(%rdx) 10085; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10086; SSE-NEXT: movaps %xmm1, (%rdx) 10087; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10088; SSE-NEXT: movaps %xmm1, 16(%rdx) 10089; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10090; SSE-NEXT: movaps %xmm1, 240(%rcx) 10091; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10092; SSE-NEXT: movaps %xmm1, 224(%rcx) 10093; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10094; SSE-NEXT: movaps %xmm1, 208(%rcx) 10095; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10096; SSE-NEXT: movaps %xmm1, 192(%rcx) 10097; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10098; SSE-NEXT: movaps %xmm1, 176(%rcx) 10099; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10100; SSE-NEXT: movaps %xmm1, 160(%rcx) 10101; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10102; SSE-NEXT: movaps %xmm1, 144(%rcx) 10103; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10104; SSE-NEXT: movaps %xmm1, 128(%rcx) 10105; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10106; SSE-NEXT: movaps %xmm1, 112(%rcx) 10107; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10108; SSE-NEXT: movaps %xmm1, 96(%rcx) 10109; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10110; SSE-NEXT: movaps %xmm1, 80(%rcx) 10111; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10112; SSE-NEXT: movaps %xmm1, 64(%rcx) 10113; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10114; SSE-NEXT: movaps %xmm1, 48(%rcx) 10115; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10116; SSE-NEXT: movaps %xmm1, 32(%rcx) 10117; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10118; SSE-NEXT: movaps %xmm1, 16(%rcx) 10119; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10120; SSE-NEXT: movaps %xmm1, (%rcx) 10121; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10122; SSE-NEXT: movaps %xmm1, 240(%r8) 10123; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload 10124; SSE-NEXT: movaps %xmm1, 224(%r8) 10125; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10126; SSE-NEXT: movaps %xmm1, 208(%r8) 10127; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10128; SSE-NEXT: movaps %xmm1, 192(%r8) 10129; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10130; SSE-NEXT: movaps %xmm1, 176(%r8) 10131; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10132; SSE-NEXT: movaps %xmm1, 160(%r8) 10133; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10134; SSE-NEXT: movaps %xmm1, 144(%r8) 10135; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10136; SSE-NEXT: movaps %xmm1, 128(%r8) 10137; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10138; SSE-NEXT: movaps %xmm1, 112(%r8) 10139; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10140; SSE-NEXT: movaps %xmm1, 96(%r8) 10141; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10142; SSE-NEXT: movaps %xmm1, 80(%r8) 10143; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10144; SSE-NEXT: movaps %xmm1, 64(%r8) 10145; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10146; SSE-NEXT: movaps %xmm1, 48(%r8) 10147; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10148; SSE-NEXT: movaps %xmm1, 32(%r8) 10149; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10150; SSE-NEXT: movaps %xmm1, 16(%r8) 10151; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10152; SSE-NEXT: movaps %xmm1, (%r8) 10153; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10154; SSE-NEXT: movaps %xmm1, 240(%r9) 10155; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10156; SSE-NEXT: movaps %xmm1, 224(%r9) 10157; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10158; SSE-NEXT: movaps %xmm1, 208(%r9) 10159; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10160; SSE-NEXT: movaps %xmm1, 192(%r9) 10161; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10162; SSE-NEXT: movaps %xmm1, 176(%r9) 10163; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10164; SSE-NEXT: movaps %xmm1, 160(%r9) 10165; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10166; SSE-NEXT: movaps %xmm1, 144(%r9) 10167; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10168; SSE-NEXT: movaps %xmm1, 128(%r9) 10169; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10170; SSE-NEXT: movaps %xmm1, 112(%r9) 10171; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10172; SSE-NEXT: movaps %xmm1, 96(%r9) 10173; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10174; SSE-NEXT: movaps %xmm1, 80(%r9) 10175; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10176; SSE-NEXT: movaps %xmm1, 64(%r9) 10177; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10178; SSE-NEXT: movaps %xmm1, 48(%r9) 10179; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10180; SSE-NEXT: movaps %xmm1, 32(%r9) 10181; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10182; SSE-NEXT: movaps %xmm1, 16(%r9) 10183; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10184; SSE-NEXT: movaps %xmm1, (%r9) 10185; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 10186; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10187; 
SSE-NEXT: movaps %xmm1, 240(%rax) 10188; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10189; SSE-NEXT: movaps %xmm1, 224(%rax) 10190; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10191; SSE-NEXT: movaps %xmm1, 208(%rax) 10192; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10193; SSE-NEXT: movaps %xmm1, 192(%rax) 10194; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10195; SSE-NEXT: movaps %xmm1, 176(%rax) 10196; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10197; SSE-NEXT: movaps %xmm1, 160(%rax) 10198; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10199; SSE-NEXT: movaps %xmm1, 144(%rax) 10200; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10201; SSE-NEXT: movaps %xmm1, 128(%rax) 10202; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10203; SSE-NEXT: movaps %xmm1, 112(%rax) 10204; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10205; SSE-NEXT: movaps %xmm1, 96(%rax) 10206; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10207; SSE-NEXT: movaps %xmm1, 80(%rax) 10208; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10209; SSE-NEXT: movaps %xmm1, 64(%rax) 10210; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10211; SSE-NEXT: movaps %xmm1, 48(%rax) 10212; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10213; SSE-NEXT: movaps %xmm1, 32(%rax) 10214; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10215; SSE-NEXT: movaps %xmm1, 16(%rax) 10216; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10217; SSE-NEXT: movaps %xmm1, (%rax) 10218; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 10219; SSE-NEXT: movaps %xmm3, 240(%rax) 10220; SSE-NEXT: movaps %xmm4, 224(%rax) 10221; SSE-NEXT: movaps %xmm5, 208(%rax) 10222; SSE-NEXT: movaps %xmm2, 192(%rax) 10223; SSE-NEXT: movaps %xmm6, 176(%rax) 10224; SSE-NEXT: movaps %xmm7, 160(%rax) 10225; SSE-NEXT: movaps %xmm13, 144(%rax) 10226; SSE-NEXT: movaps %xmm14, 128(%rax) 10227; SSE-NEXT: movaps %xmm15, 112(%rax) 10228; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10229; SSE-NEXT: movaps %xmm1, 96(%rax) 10230; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10231; SSE-NEXT: movaps %xmm1, 80(%rax) 10232; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10233; SSE-NEXT: movaps %xmm1, 64(%rax) 10234; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10235; SSE-NEXT: movaps %xmm1, 48(%rax) 10236; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10237; SSE-NEXT: movaps %xmm1, 32(%rax) 10238; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10239; SSE-NEXT: movaps %xmm1, 16(%rax) 10240; SSE-NEXT: movaps %xmm0, (%rax) 10241; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 10242; SSE-NEXT: movaps %xmm9, 240(%rax) 10243; SSE-NEXT: movaps %xmm8, 224(%rax) 10244; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10245; SSE-NEXT: movaps %xmm0, 208(%rax) 10246; SSE-NEXT: movaps %xmm10, 192(%rax) 10247; SSE-NEXT: movaps %xmm12, 176(%rax) 10248; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10249; SSE-NEXT: movaps %xmm0, 160(%rax) 10250; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10251; SSE-NEXT: movaps %xmm0, 144(%rax) 10252; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10253; SSE-NEXT: movaps %xmm0, 128(%rax) 10254; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10255; SSE-NEXT: movaps %xmm0, 112(%rax) 10256; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10257; SSE-NEXT: movaps %xmm0, 96(%rax) 10258; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 10259; SSE-NEXT: movaps %xmm0, 80(%rax) 10260; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10261; SSE-NEXT: movaps %xmm0, 64(%rax) 10262; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10263; SSE-NEXT: movaps %xmm0, 48(%rax) 10264; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10265; SSE-NEXT: movaps %xmm0, 32(%rax) 10266; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10267; SSE-NEXT: movaps %xmm0, 16(%rax) 10268; SSE-NEXT: movaps %xmm11, (%rax) 10269; SSE-NEXT: addq $2232, %rsp # imm = 0x8B8 10270; SSE-NEXT: retq 10271; 10272; AVX-LABEL: load_i32_stride8_vf64: 10273; AVX: # %bb.0: 10274; AVX-NEXT: subq $3720, %rsp # imm = 0xE88 10275; AVX-NEXT: vmovaps 288(%rdi), %xmm13 10276; AVX-NEXT: vmovaps 256(%rdi), %xmm15 10277; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] 10278; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10279; AVX-NEXT: vmovaps 352(%rdi), %xmm1 10280; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10281; AVX-NEXT: vmovaps 320(%rdi), %xmm2 10282; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10283; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 10284; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10285; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 10286; AVX-NEXT: vmovaps 416(%rdi), %xmm1 10287; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10288; AVX-NEXT: vmovaps 384(%rdi), %xmm2 10289; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10290; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 10291; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10292; AVX-NEXT: vmovaps 480(%rdi), %xmm2 10293; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10294; AVX-NEXT: vmovaps 448(%rdi), %xmm3 10295; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10296; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 10297; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10298; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] 10299; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 10300; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] 10301; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 10302; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10303; AVX-NEXT: vmovaps 928(%rdi), %xmm1 10304; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10305; AVX-NEXT: vmovaps 896(%rdi), %xmm0 10306; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10307; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 10308; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 10309; AVX-NEXT: vmovaps 992(%rdi), %xmm1 10310; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10311; AVX-NEXT: vmovaps 960(%rdi), %xmm2 10312; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10313; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 10314; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10315; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] 10316; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, 
%ymm1 10317; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 10318; AVX-NEXT: vmovaps 800(%rdi), %xmm1 10319; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10320; AVX-NEXT: vmovaps 768(%rdi), %xmm14 10321; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] 10322; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10323; AVX-NEXT: vmovaps 864(%rdi), %xmm2 10324; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10325; AVX-NEXT: vmovaps 832(%rdi), %xmm3 10326; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10327; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 10328; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10329; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] 10330; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10331; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10332; AVX-NEXT: vmovaps 1440(%rdi), %xmm1 10333; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10334; AVX-NEXT: vmovaps 1408(%rdi), %xmm0 10335; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10336; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 10337; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 10338; AVX-NEXT: vmovaps 1504(%rdi), %xmm1 10339; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10340; AVX-NEXT: vmovaps 1472(%rdi), %xmm2 10341; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10342; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 10343; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10344; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] 10345; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10346; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 10347; AVX-NEXT: vmovaps 1312(%rdi), %xmm2 10348; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10349; AVX-NEXT: vmovaps 1280(%rdi), %xmm1 10350; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10351; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 10352; AVX-NEXT: vmovaps 1376(%rdi), %xmm2 10353; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10354; AVX-NEXT: vmovaps 1344(%rdi), %xmm3 10355; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10356; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 10357; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10358; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] 10359; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10360; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10361; AVX-NEXT: vmovaps 1952(%rdi), %xmm0 10362; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10363; AVX-NEXT: vmovaps 1920(%rdi), %xmm1 10364; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10365; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 10366; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 10367; AVX-NEXT: vmovaps 2016(%rdi), %xmm1 10368; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10369; AVX-NEXT: vmovaps 1984(%rdi), %xmm2 10370; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10371; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 10372; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10373; AVX-NEXT: vshufps {{.*#+}} xmm1 = 
xmm1[0,1,0,1] 10374; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10375; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 10376; AVX-NEXT: vmovaps 1824(%rdi), %xmm1 10377; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10378; AVX-NEXT: vmovaps 1792(%rdi), %xmm5 10379; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] 10380; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10381; AVX-NEXT: vmovaps 1888(%rdi), %xmm2 10382; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10383; AVX-NEXT: vmovaps 1856(%rdi), %xmm3 10384; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10385; AVX-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 10386; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] 10387; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10388; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10389; AVX-NEXT: vmovaps 160(%rdi), %xmm0 10390; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10391; AVX-NEXT: vmovaps 128(%rdi), %xmm1 10392; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10393; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 10394; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 10395; AVX-NEXT: vmovaps 224(%rdi), %xmm1 10396; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10397; AVX-NEXT: vmovaps 192(%rdi), %xmm2 10398; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10399; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 10400; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10401; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] 10402; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10403; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 10404; AVX-NEXT: vmovaps 32(%rdi), %xmm1 10405; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill 10406; AVX-NEXT: vmovaps (%rdi), %xmm4 10407; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 10408; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10409; AVX-NEXT: vmovaps 96(%rdi), %xmm2 10410; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10411; AVX-NEXT: vmovaps 64(%rdi), %xmm3 10412; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10413; AVX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 10414; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] 10415; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10416; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10417; AVX-NEXT: vmovaps 672(%rdi), %xmm1 10418; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10419; AVX-NEXT: vmovaps 640(%rdi), %xmm0 10420; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10421; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 10422; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 10423; AVX-NEXT: vmovaps 736(%rdi), %xmm1 10424; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10425; AVX-NEXT: vmovaps 704(%rdi), %xmm2 10426; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10427; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 10428; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10429; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm1[0,1,0,1] 10430; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 10431; AVX-NEXT: vblendps {{.*#+}} ymm6 = 
ymm0[0,1,2,3,4,5],ymm6[6,7] 10432; AVX-NEXT: vmovaps 544(%rdi), %xmm0 10433; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10434; AVX-NEXT: vmovaps 512(%rdi), %xmm2 10435; AVX-NEXT: vunpcklps {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 10436; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10437; AVX-NEXT: vmovaps 608(%rdi), %xmm0 10438; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10439; AVX-NEXT: vmovaps 576(%rdi), %xmm1 10440; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10441; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 10442; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] 10443; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] 10444; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10445; AVX-NEXT: vmovaps 1184(%rdi), %xmm0 10446; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10447; AVX-NEXT: vmovaps 1152(%rdi), %xmm6 10448; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10449; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] 10450; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 10451; AVX-NEXT: vmovaps 1248(%rdi), %xmm0 10452; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10453; AVX-NEXT: vmovaps 1216(%rdi), %xmm6 10454; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10455; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] 10456; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10457; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm0[0,1,0,1] 10458; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 10459; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] 10460; AVX-NEXT: vmovaps 1056(%rdi), %xmm0 10461; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10462; AVX-NEXT: vmovaps 1024(%rdi), %xmm6 10463; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10464; AVX-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] 10465; AVX-NEXT: vmovaps 1120(%rdi), %xmm0 10466; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10467; AVX-NEXT: vmovaps 1088(%rdi), %xmm6 10468; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10469; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] 10470; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10471; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] 10472; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] 10473; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10474; AVX-NEXT: vmovaps 1696(%rdi), %xmm0 10475; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10476; AVX-NEXT: vmovaps 1664(%rdi), %xmm6 10477; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10478; AVX-NEXT: vunpcklps {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] 10479; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9 10480; AVX-NEXT: vmovaps 1760(%rdi), %xmm0 10481; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10482; AVX-NEXT: vmovaps 1728(%rdi), %xmm6 10483; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10484; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] 10485; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10486; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,1,0,1] 10487; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 10488; AVX-NEXT: vblendps {{.*#+}} ymm11 = 
ymm9[0,1,2,3,4,5],ymm11[6,7] 10489; AVX-NEXT: vmovaps 1568(%rdi), %xmm12 10490; AVX-NEXT: vmovaps 1536(%rdi), %xmm8 10491; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] 10492; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10493; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10494; AVX-NEXT: vmovaps 1632(%rdi), %xmm6 10495; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10496; AVX-NEXT: vmovaps 1600(%rdi), %xmm7 10497; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10498; AVX-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] 10499; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] 10500; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] 10501; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10502; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] 10503; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3] 10504; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10505; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3] 10506; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload 10507; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 10508; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1] 10509; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 10510; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm6[1],xmm15[2,3] 10511; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 10512; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] 10513; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] 10514; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10515; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] 10516; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10517; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 10518; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10519; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3] 10520; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload 10521; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 10522; AVX-NEXT: # xmm15 = mem[1,1,1,1] 10523; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 10524; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3] 10525; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 10526; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] 10527; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] 10528; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10529; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 10530; AVX-NEXT: # xmm0 = mem[1,1,1,1] 10531; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10532; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 10533; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10534; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3] 10535; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload 10536; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 10537; AVX-NEXT: # xmm15 = mem[1,1,1,1] 10538; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload 10539; AVX-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3] 10540; AVX-NEXT: vinsertf128 
$1, %xmm15, %ymm0, %ymm15 10541; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] 10542; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] 10543; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10544; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] 10545; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10546; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 10547; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] 10548; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload 10549; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 10550; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,1,1,1] 10551; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 10552; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3] 10553; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 10554; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] 10555; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] 10556; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10557; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1] 10558; AVX-NEXT: vblendps $2, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 10559; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 10560; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] 10561; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload 10562; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 10563; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] 10564; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 10565; AVX-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 10566; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 10567; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] 10568; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 10569; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10570; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1] 10571; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10572; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 10573; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 10574; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 10575; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 10576; AVX-NEXT: # xmm2 = mem[1,1,1,1] 10577; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 10578; AVX-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 10579; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 10580; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 10581; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 10582; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10583; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 10584; AVX-NEXT: # xmm0 = mem[1,1,1,1] 10585; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10586; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 10587; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10588; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3] 10589; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 10590; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 10591; AVX-NEXT: # xmm2 = 
mem[1,1,1,1] 10592; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 10593; AVX-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 10594; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 10595; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 10596; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 10597; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10598; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] 10599; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] 10600; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] 10601; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 10602; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10603; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] 10604; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 10605; AVX-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 10606; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 10607; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 10608; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 10609; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10610; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10611; AVX-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] 10612; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10613; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] 10614; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10615; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 10616; AVX-NEXT: # xmm0 = mem[2,2,2,2] 10617; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10618; AVX-NEXT: # xmm0 = mem[0,1,2],xmm0[3] 10619; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 10620; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10621; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 10622; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 10623; AVX-NEXT: # xmm1 = mem[2,2,2,2] 10624; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 10625; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3] 10626; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 10627; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10628; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10629; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10630; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload 10631; AVX-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] 10632; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10633; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10634; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] 10635; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10636; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 10637; AVX-NEXT: # xmm0 = mem[2,2,2,2] 10638; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10639; AVX-NEXT: # xmm0 = mem[0,1,2],xmm0[3] 10640; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 10641; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10642; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 10643; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte 
Reload 10644; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] 10645; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 10646; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] 10647; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 10648; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10649; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10650; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10651; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload 10652; AVX-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] 10653; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10654; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10655; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 10656; AVX-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 10657; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10658; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 10659; AVX-NEXT: # xmm0 = mem[2,2,2,2] 10660; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10661; AVX-NEXT: # xmm0 = mem[0,1,2],xmm0[3] 10662; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 10663; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10664; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 10665; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 10666; AVX-NEXT: # xmm1 = mem[2,2,2,2] 10667; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 10668; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] 10669; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 10670; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10671; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10672; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10673; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload 10674; AVX-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] 10675; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10676; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] 10677; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10678; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 10679; AVX-NEXT: # xmm0 = mem[2,2,2,2] 10680; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10681; AVX-NEXT: # xmm0 = mem[0,1,2],xmm0[3] 10682; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 10683; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10684; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 10685; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 10686; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] 10687; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 10688; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] 10689; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] 10690; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10691; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10692; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10693; AVX-NEXT: vunpckhps (%rsp), %xmm0, %xmm8 # 16-byte Folded Reload 10694; AVX-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] 10695; AVX-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill 10696; AVX-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload 10697; AVX-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] 10698; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10699; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 10700; AVX-NEXT: # xmm0 = mem[2,2,2,2] 10701; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10702; AVX-NEXT: # xmm0 = mem[0,1,2],xmm0[3] 10703; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 10704; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10705; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 10706; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 10707; AVX-NEXT: # xmm1 = mem[2,2,2,2] 10708; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 10709; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] 10710; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] 10711; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10712; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10713; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10714; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload 10715; AVX-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] 10716; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10717; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10718; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10719; AVX-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 10720; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10721; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 10722; AVX-NEXT: # xmm15 = mem[2,2,2,2] 10723; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload 10724; AVX-NEXT: # xmm15 = mem[0,1,2],xmm15[3] 10725; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 10726; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 10727; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 10728; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 10729; AVX-NEXT: # xmm15 = mem[2,2,2,2] 10730; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10731; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] 10732; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3] 10733; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm14[4,5,6,7] 10734; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10735; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 10736; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload 10737; AVX-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3] 10738; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10739; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 10740; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 10741; AVX-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] 10742; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10743; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 10744; AVX-NEXT: # xmm13 = mem[2,2,2,2] 10745; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload 10746; AVX-NEXT: # xmm13 = mem[0,1,2],xmm13[3] 10747; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 10748; AVX-NEXT: 
vinsertf128 $1, %xmm8, %ymm0, %ymm12 10749; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] 10750; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 10751; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,2,2,2] 10752; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 10753; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] 10754; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] 10755; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm12[4,5,6,7] 10756; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10757; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 10758; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload 10759; AVX-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3] 10760; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10761; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm8 # 16-byte Folded Reload 10762; AVX-NEXT: # xmm8 = xmm9[2],mem[2],xmm9[3],mem[3] 10763; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10764; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 10765; AVX-NEXT: # xmm11 = mem[2,2,2,2] 10766; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 10767; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] 10768; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 10769; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9 10770; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] 10771; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 10772; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm12[2,2,2,2] 10773; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 10774; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] 10775; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm0[0,1],xmm11[2,3] 10776; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm9[4,5,6,7] 10777; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10778; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 10779; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload 10780; AVX-NEXT: # xmm9 = xmm8[2],mem[2],xmm8[3],mem[3] 10781; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 10782; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm9[1] 10783; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 10784; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm11 # 16-byte Folded Reload 10785; AVX-NEXT: # xmm11 = xmm8[2],mem[2],xmm8[3],mem[3] 10786; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 10787; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 10788; AVX-NEXT: # xmm8 = mem[2,3,2,3] 10789; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 10790; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] 10791; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 10792; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10793; AVX-NEXT: vunpckhps {{.*#+}} xmm8 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] 10794; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 10795; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm8[1] 10796; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 10797; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload 10798; AVX-NEXT: # xmm8 = xmm6[2],mem[2],xmm6[3],mem[3] 10799; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 10800; AVX-NEXT: 
vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 10801; AVX-NEXT: # xmm9 = mem[2,3,2,3] 10802; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 10803; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] 10804; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 10805; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10806; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload 10807; AVX-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] 10808; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 10809; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm7[1] 10810; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 10811; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload 10812; AVX-NEXT: # xmm8 = xmm4[2],mem[2],xmm4[3],mem[3] 10813; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 10814; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 10815; AVX-NEXT: # xmm6 = mem[2,3,2,3] 10816; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 10817; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] 10818; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 10819; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10820; AVX-NEXT: vunpckhps {{.*#+}} xmm6 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] 10821; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 10822; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm6[1] 10823; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 10824; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload 10825; AVX-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] 10826; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 10827; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 10828; AVX-NEXT: # xmm5 = mem[2,3,2,3] 10829; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 10830; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] 10831; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 10832; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10833; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload 10834; AVX-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] 10835; AVX-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload 10836; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] 10837; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 10838; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload 10839; AVX-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] 10840; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 10841; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 10842; AVX-NEXT: # xmm3 = mem[2,3,2,3] 10843; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 10844; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] 10845; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 10846; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10847; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload 10848; AVX-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] 10849; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10850; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] 10851; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10852; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte 
Folded Reload 10853; AVX-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] 10854; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 10855; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 10856; AVX-NEXT: # xmm1 = mem[2,3,2,3] 10857; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10858; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] 10859; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 10860; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10861; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] 10862; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10863; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 10864; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10865; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 10866; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] 10867; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10868; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 10869; AVX-NEXT: # xmm2 = mem[2,3,2,3] 10870; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 10871; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 10872; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 10873; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10874; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] 10875; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10876; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 10877; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload 10878; AVX-NEXT: # xmm1 = xmm13[2],mem[2],xmm13[3],mem[3] 10879; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10880; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 10881; AVX-NEXT: # xmm2 = mem[2,3,2,3] 10882; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 10883; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 10884; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 10885; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10886; AVX-NEXT: vmovaps 416(%rdi), %ymm2 10887; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10888; AVX-NEXT: vmovaps 384(%rdi), %ymm3 10889; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10890; AVX-NEXT: vmovaps 448(%rdi), %ymm4 10891; AVX-NEXT: vmovaps 480(%rdi), %ymm0 10892; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10893; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] 10894; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 10895; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 10896; AVX-NEXT: vmovaps 320(%rdi), %ymm6 10897; AVX-NEXT: vmovaps 352(%rdi), %ymm13 10898; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm6[0],ymm13[2],ymm6[2] 10899; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10900; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 10901; AVX-NEXT: vmovaps 288(%rdi), %ymm8 10902; AVX-NEXT: vmovaps 256(%rdi), %ymm7 10903; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] 10904; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 10905; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] 10906; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10907; AVX-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10908; AVX-NEXT: vmovaps 672(%rdi), %ymm2 10909; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10910; AVX-NEXT: vmovaps 640(%rdi), %ymm9 10911; AVX-NEXT: vmovaps 704(%rdi), %ymm12 10912; AVX-NEXT: vmovaps 736(%rdi), %ymm10 10913; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] 10914; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5] 10915; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 10916; AVX-NEXT: vmovaps 576(%rdi), %ymm14 10917; AVX-NEXT: vmovaps 608(%rdi), %ymm11 10918; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] 10919; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10920; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 10921; AVX-NEXT: vmovaps 544(%rdi), %ymm2 10922; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10923; AVX-NEXT: vmovaps 512(%rdi), %ymm3 10924; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10925; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 10926; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 10927; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] 10928; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10929; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10930; AVX-NEXT: vmovaps 928(%rdi), %ymm2 10931; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10932; AVX-NEXT: vmovaps 896(%rdi), %ymm3 10933; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10934; AVX-NEXT: vmovaps 960(%rdi), %ymm1 10935; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10936; AVX-NEXT: vmovaps 992(%rdi), %ymm0 10937; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10938; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 10939; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 10940; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 10941; AVX-NEXT: vmovaps 832(%rdi), %ymm2 10942; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10943; AVX-NEXT: vmovaps 864(%rdi), %ymm1 10944; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10945; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] 10946; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 10947; AVX-NEXT: vmovaps 800(%rdi), %ymm3 10948; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10949; AVX-NEXT: vmovaps 768(%rdi), %ymm2 10950; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10951; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 10952; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 10953; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] 10954; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10955; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10956; AVX-NEXT: vmovaps 1184(%rdi), %ymm2 10957; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10958; AVX-NEXT: vmovaps 1152(%rdi), %ymm1 10959; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10960; AVX-NEXT: vmovaps 1216(%rdi), %ymm0 10961; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10962; AVX-NEXT: vmovaps 1248(%rdi), %ymm3 10963; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
10964; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] 10965; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] 10966; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 10967; AVX-NEXT: vmovaps 1088(%rdi), %ymm2 10968; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10969; AVX-NEXT: vmovaps 1120(%rdi), %ymm1 10970; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10971; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] 10972; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 10973; AVX-NEXT: vmovaps 1056(%rdi), %ymm3 10974; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10975; AVX-NEXT: vmovaps 1024(%rdi), %ymm2 10976; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10977; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 10978; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 10979; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] 10980; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10981; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10982; AVX-NEXT: vmovaps 1440(%rdi), %ymm2 10983; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10984; AVX-NEXT: vmovaps 1408(%rdi), %ymm3 10985; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10986; AVX-NEXT: vmovaps 1472(%rdi), %ymm1 10987; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10988; AVX-NEXT: vmovaps 1504(%rdi), %ymm0 10989; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10990; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 10991; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 10992; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 10993; AVX-NEXT: vmovaps 1344(%rdi), %ymm2 10994; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10995; AVX-NEXT: vmovaps 1376(%rdi), %ymm1 10996; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10997; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] 10998; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 10999; AVX-NEXT: vmovaps 1312(%rdi), %ymm3 11000; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11001; AVX-NEXT: vmovaps 1280(%rdi), %ymm2 11002; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11003; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 11004; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11005; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] 11006; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 11007; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11008; AVX-NEXT: vmovaps 1696(%rdi), %ymm2 11009; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11010; AVX-NEXT: vmovaps 1664(%rdi), %ymm3 11011; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11012; AVX-NEXT: vmovaps 1728(%rdi), %ymm1 11013; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11014; AVX-NEXT: vmovaps 1760(%rdi), %ymm0 11015; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11016; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 11017; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 11018; AVX-NEXT: vshufps 
{{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 11019; AVX-NEXT: vmovaps 1600(%rdi), %ymm2 11020; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11021; AVX-NEXT: vmovaps 1632(%rdi), %ymm1 11022; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11023; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] 11024; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 11025; AVX-NEXT: vmovaps 1568(%rdi), %ymm3 11026; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11027; AVX-NEXT: vmovaps 1536(%rdi), %ymm2 11028; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11029; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 11030; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11031; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] 11032; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 11033; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11034; AVX-NEXT: vmovaps 1952(%rdi), %ymm2 11035; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11036; AVX-NEXT: vmovaps 1920(%rdi), %ymm3 11037; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11038; AVX-NEXT: vmovaps 1984(%rdi), %ymm1 11039; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11040; AVX-NEXT: vmovaps 2016(%rdi), %ymm0 11041; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11042; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 11043; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 11044; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 11045; AVX-NEXT: vmovaps 1856(%rdi), %ymm2 11046; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11047; AVX-NEXT: vmovaps 1888(%rdi), %ymm1 11048; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11049; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] 11050; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 11051; AVX-NEXT: vmovaps 1824(%rdi), %ymm3 11052; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11053; AVX-NEXT: vmovaps 1792(%rdi), %ymm2 11054; AVX-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill 11055; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 11056; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11057; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] 11058; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 11059; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11060; AVX-NEXT: vmovaps 160(%rdi), %ymm2 11061; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11062; AVX-NEXT: vmovaps 128(%rdi), %ymm3 11063; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11064; AVX-NEXT: vmovaps 192(%rdi), %ymm1 11065; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11066; AVX-NEXT: vmovaps 224(%rdi), %ymm0 11067; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11068; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 11069; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 11070; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] 11071; AVX-NEXT: vmovaps 64(%rdi), %ymm0 11072; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11073; AVX-NEXT: vmovaps 96(%rdi), %ymm1 11074; AVX-NEXT: 
vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11075; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 11076; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 11077; AVX-NEXT: vmovaps (%rdi), %ymm1 11078; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11079; AVX-NEXT: vmovaps 32(%rdi), %ymm3 11080; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11081; AVX-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] 11082; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11083; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0] 11084; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 11085; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11086; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11087; AVX-NEXT: vmovaps %ymm4, %ymm5 11088; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11089; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] 11090; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11091; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11092; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm4[1,0],ymm1[5,4],ymm4[5,4] 11093; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11094; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11095; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[4],ymm13[4],ymm6[5],ymm13[5] 11096; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11097; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11098; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11099; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] 11100; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11101; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11102; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11103; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11104; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11105; AVX-NEXT: vmovaps %ymm12, %ymm13 11106; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11107; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[4],ymm10[4],ymm12[5],ymm10[5] 11108; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11109; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 11110; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,0],ymm9[1,0],ymm12[5,4],ymm9[5,4] 11111; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11112; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11113; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[4],ymm11[4],ymm14[5],ymm11[5] 11114; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11115; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 11116; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 11117; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,0],ymm14[1,0],ymm15[5,4],ymm14[5,4] 11118; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11119; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11120; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11121; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11122; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11123; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0, %ymm0 # 32-byte Folded Reload 11124; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 11125; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11126; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11127; AVX-NEXT: # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4] 11128; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11129; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11130; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11131; AVX-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] 11132; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11133; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 11134; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload 11135; AVX-NEXT: # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] 11136; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11137; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11138; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11139; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11140; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11141; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11142; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 11143; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11144; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11145; AVX-NEXT: # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4] 11146; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11147; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11148; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11149; AVX-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] 11150; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11151; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 11152; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload 11153; AVX-NEXT: # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] 11154; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11155; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11156; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11157; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11158; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11159; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11160; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 11161; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11162; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11163; AVX-NEXT: # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4] 11164; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11165; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11166; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11167; AVX-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] 11168; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11169; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 11170; AVX-NEXT: vshufps $17, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload 11171; AVX-NEXT: # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] 11172; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11173; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11174; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11175; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11176; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11177; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11178; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 11179; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11180; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11181; AVX-NEXT: # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4] 11182; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11183; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11184; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11185; AVX-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] 11186; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11187; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 11188; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload 11189; AVX-NEXT: # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] 11190; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11191; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11192; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11193; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11194; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11195; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11196; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 11197; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11198; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11199; AVX-NEXT: # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4] 11200; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11201; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11202; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11203; AVX-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] 11204; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11205; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 11206; AVX-NEXT: vshufps $17, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload 11207; AVX-NEXT: # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] 11208; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11209; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11210; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11211; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11212; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11213; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11214; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 11215; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11216; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11217; AVX-NEXT: # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4] 
11218; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11219; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11220; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11221; AVX-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] 11222; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11223; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 11224; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload 11225; AVX-NEXT: # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] 11226; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11227; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11228; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11229; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11230; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] 11231; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] 11232; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] 11233; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11234; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] 11235; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11236; AVX-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] 11237; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11238; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0] 11239; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11240; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11241; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] 11242; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[6],ymm12[6],ymm9[7],ymm12[7] 11243; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] 11244; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload 11245; AVX-NEXT: # ymm2 = ymm11[1],mem[1],ymm11[3],mem[3] 11246; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11247; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11248; AVX-NEXT: vunpckhps {{.*#+}} ymm15 = ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[6],ymm1[6],ymm14[7],ymm1[7] 11249; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11250; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0] 11251; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11252; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11253; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 11254; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 11255; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] 11256; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 11257; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11258; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] 11259; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] 11260; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 11261; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11262; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] 11263; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11264; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11265; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm7 # 32-byte Reload 11266; AVX-NEXT: vunpckhps {{.*#+}} ymm15 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7] 11267; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11268; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0] 11269; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11270; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11271; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 11272; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload 11273; AVX-NEXT: # ymm0 = ymm13[1],mem[1],ymm13[3],mem[3] 11274; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 11275; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 11276; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] 11277; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] 11278; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11279; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11280; AVX-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] 11281; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11282; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 11283; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload 11284; AVX-NEXT: # ymm15 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] 11285; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11286; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0] 11287; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11288; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11289; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11290; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11291; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] 11292; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11293; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11294; AVX-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 11295; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] 11296; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11297; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11298; AVX-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] 11299; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11300; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 11301; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload 11302; AVX-NEXT: # ymm15 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] 11303; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11304; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0] 11305; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11306; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11307; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11308; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11309; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] 11310; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11311; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11312; AVX-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 11313; AVX-NEXT: 
vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] 11314; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11315; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11316; AVX-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] 11317; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11318; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 11319; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload 11320; AVX-NEXT: # ymm15 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] 11321; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11322; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0] 11323; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11324; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11325; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11326; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11327; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] 11328; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11329; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11330; AVX-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 11331; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] 11332; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11333; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11334; AVX-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] 11335; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11336; AVX-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload 11337; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload 11338; AVX-NEXT: # ymm15 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] 11339; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11340; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0] 11341; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11342; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11343; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11344; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11345; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] 11346; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11347; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11348; AVX-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 11349; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] 11350; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11351; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11352; AVX-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] 11353; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11354; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 11355; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload 11356; AVX-NEXT: # ymm15 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] 11357; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11358; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,0] 11359; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11360; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11361; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload 11362; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11363; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 11364; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11365; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11366; AVX-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] 11367; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11368; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11369; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] 11370; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11371; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload 11372; AVX-NEXT: # ymm15 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4] 11373; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11374; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11375; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11376; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11377; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11378; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11379; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 11380; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11381; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11382; AVX-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] 11383; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11384; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11385; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11386; AVX-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 11387; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload 11388; AVX-NEXT: # ymm15 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] 11389; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11390; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11391; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11392; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11393; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11394; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] 11395; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm9[3,0],ymm3[7,4],ymm9[7,4] 11396; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11397; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] 11398; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm7[3,0],ymm10[3,0],ymm7[7,4],ymm10[7,4] 11399; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11400; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11401; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11402; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11403; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11404; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] 11405; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,0],ymm14[3,0],ymm12[7,4],ymm14[7,4] 11406; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11407; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11408; AVX-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload 11409; AVX-NEXT: # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 11410; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11411; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload 11412; AVX-NEXT: # ymm15 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] 11413; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11414; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11415; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11416; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11417; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11418; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11419; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 11420; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11421; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload 11422; AVX-NEXT: # ymm2 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] 11423; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11424; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11425; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload 11426; AVX-NEXT: # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 11427; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11428; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload 11429; AVX-NEXT: # ymm15 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] 11430; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11431; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11432; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11433; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11434; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11435; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11436; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 11437; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11438; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload 11439; AVX-NEXT: # ymm2 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] 11440; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11441; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11442; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload 11443; AVX-NEXT: # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 11444; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11445; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload 11446; AVX-NEXT: # ymm15 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] 11447; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11448; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 11449; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,0],xmm2[2,3] 11450; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11451; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11452; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11453; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 11454; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11455; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 
32-byte Folded Reload 11456; AVX-NEXT: # ymm2 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] 11457; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11458; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11459; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload 11460; AVX-NEXT: # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 11461; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11462; AVX-NEXT: vshufps $51, (%rsp), %ymm1, %ymm7 # 32-byte Folded Reload 11463; AVX-NEXT: # ymm7 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] 11464; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11465; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 11466; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,0],xmm2[2,3] 11467; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11468; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11469; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11470; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 11471; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11472; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload 11473; AVX-NEXT: # ymm2 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] 11474; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] 11475; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11476; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload 11477; AVX-NEXT: # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 11478; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11479; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 11480; AVX-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] 11481; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 11482; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 11483; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] 11484; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 11485; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11486; AVX-NEXT: vmovaps %ymm1, 192(%rsi) 11487; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11488; AVX-NEXT: vmovaps %ymm1, 128(%rsi) 11489; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11490; AVX-NEXT: vmovaps %ymm1, 64(%rsi) 11491; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11492; AVX-NEXT: vmovaps %ymm1, (%rsi) 11493; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11494; AVX-NEXT: vmovaps %ymm1, 224(%rsi) 11495; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11496; AVX-NEXT: vmovaps %ymm1, 160(%rsi) 11497; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11498; AVX-NEXT: vmovaps %ymm1, 96(%rsi) 11499; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11500; AVX-NEXT: vmovaps %ymm1, 32(%rsi) 11501; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11502; AVX-NEXT: vmovaps %ymm1, 192(%rdx) 11503; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11504; AVX-NEXT: vmovaps %ymm1, 128(%rdx) 11505; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11506; AVX-NEXT: vmovaps %ymm1, 64(%rdx) 11507; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11508; AVX-NEXT: vmovaps %ymm1, (%rdx) 11509; AVX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11510; AVX-NEXT: vmovaps %ymm1, 224(%rdx) 11511; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11512; AVX-NEXT: vmovaps %ymm1, 160(%rdx) 11513; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11514; AVX-NEXT: vmovaps %ymm1, 96(%rdx) 11515; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11516; AVX-NEXT: vmovaps %ymm1, 32(%rdx) 11517; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11518; AVX-NEXT: vmovaps %ymm1, 192(%rcx) 11519; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11520; AVX-NEXT: vmovaps %ymm1, 128(%rcx) 11521; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11522; AVX-NEXT: vmovaps %ymm1, 64(%rcx) 11523; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11524; AVX-NEXT: vmovaps %ymm1, (%rcx) 11525; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11526; AVX-NEXT: vmovaps %ymm1, 224(%rcx) 11527; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11528; AVX-NEXT: vmovaps %ymm1, 160(%rcx) 11529; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11530; AVX-NEXT: vmovaps %ymm1, 96(%rcx) 11531; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11532; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 11533; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11534; AVX-NEXT: vmovaps %ymm1, 192(%r8) 11535; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11536; AVX-NEXT: vmovaps %ymm1, 128(%r8) 11537; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11538; AVX-NEXT: vmovaps %ymm1, 64(%r8) 11539; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11540; AVX-NEXT: vmovaps %ymm1, (%r8) 11541; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11542; AVX-NEXT: vmovaps %ymm1, 224(%r8) 11543; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11544; AVX-NEXT: vmovaps %ymm1, 160(%r8) 11545; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11546; AVX-NEXT: vmovaps %ymm1, 96(%r8) 11547; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11548; AVX-NEXT: vmovaps %ymm1, 32(%r8) 11549; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11550; AVX-NEXT: vmovaps %ymm1, 224(%r9) 11551; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11552; AVX-NEXT: vmovaps %ymm1, 192(%r9) 11553; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11554; AVX-NEXT: vmovaps %ymm1, 160(%r9) 11555; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11556; AVX-NEXT: vmovaps %ymm1, 128(%r9) 11557; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11558; AVX-NEXT: vmovaps %ymm1, 96(%r9) 11559; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11560; AVX-NEXT: vmovaps %ymm1, 64(%r9) 11561; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11562; AVX-NEXT: vmovaps %ymm1, 32(%r9) 11563; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11564; AVX-NEXT: vmovaps %ymm1, (%r9) 11565; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 11566; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11567; AVX-NEXT: vmovaps %ymm1, 224(%rax) 11568; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11569; AVX-NEXT: vmovaps %ymm1, 192(%rax) 11570; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11571; AVX-NEXT: 
vmovaps %ymm1, 160(%rax) 11572; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11573; AVX-NEXT: vmovaps %ymm1, 128(%rax) 11574; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11575; AVX-NEXT: vmovaps %ymm1, 96(%rax) 11576; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11577; AVX-NEXT: vmovaps %ymm1, 64(%rax) 11578; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11579; AVX-NEXT: vmovaps %ymm1, 32(%rax) 11580; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11581; AVX-NEXT: vmovaps %ymm1, (%rax) 11582; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 11583; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11584; AVX-NEXT: vmovaps %ymm1, 224(%rax) 11585; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11586; AVX-NEXT: vmovaps %ymm1, 192(%rax) 11587; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11588; AVX-NEXT: vmovaps %ymm1, 160(%rax) 11589; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11590; AVX-NEXT: vmovaps %ymm1, 128(%rax) 11591; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11592; AVX-NEXT: vmovaps %ymm1, 96(%rax) 11593; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11594; AVX-NEXT: vmovaps %ymm1, 64(%rax) 11595; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11596; AVX-NEXT: vmovaps %ymm1, 32(%rax) 11597; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11598; AVX-NEXT: vmovaps %ymm1, (%rax) 11599; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 11600; AVX-NEXT: vmovaps %ymm7, 224(%rax) 11601; AVX-NEXT: vmovaps %ymm15, 192(%rax) 11602; AVX-NEXT: vmovaps %ymm6, 160(%rax) 11603; AVX-NEXT: vmovaps %ymm5, 128(%rax) 11604; AVX-NEXT: vmovaps %ymm3, 96(%rax) 11605; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11606; AVX-NEXT: vmovaps %ymm1, 64(%rax) 11607; AVX-NEXT: vmovaps %ymm4, 32(%rax) 11608; AVX-NEXT: vmovaps %ymm0, (%rax) 11609; AVX-NEXT: addq $3720, %rsp # imm = 0xE88 11610; AVX-NEXT: vzeroupper 11611; AVX-NEXT: retq 11612; 11613; AVX2-LABEL: load_i32_stride8_vf64: 11614; AVX2: # %bb.0: 11615; AVX2-NEXT: subq $3528, %rsp # imm = 0xDC8 11616; AVX2-NEXT: vmovaps 288(%rdi), %xmm10 11617; AVX2-NEXT: vmovaps 256(%rdi), %xmm0 11618; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11619; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] 11620; AVX2-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11621; AVX2-NEXT: vmovaps 352(%rdi), %xmm9 11622; AVX2-NEXT: vbroadcastss %xmm9, %xmm1 11623; AVX2-NEXT: vmovaps 320(%rdi), %xmm2 11624; AVX2-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill 11625; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 11626; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 11627; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 11628; AVX2-NEXT: vmovaps 416(%rdi), %xmm1 11629; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11630; AVX2-NEXT: vmovaps 384(%rdi), %xmm2 11631; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11632; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 11633; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11634; AVX2-NEXT: vmovaps 480(%rdi), %xmm13 11635; AVX2-NEXT: vbroadcastss %xmm13, %xmm2 11636; AVX2-NEXT: vmovaps 448(%rdi), %xmm3 11637; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11638; AVX2-NEXT: vbroadcastss %xmm3, %xmm3 11639; AVX2-NEXT: 
vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 11640; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11641; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] 11642; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11643; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11644; AVX2-NEXT: vmovaps 800(%rdi), %xmm0 11645; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11646; AVX2-NEXT: vmovaps 768(%rdi), %xmm1 11647; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11648; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 11649; AVX2-NEXT: vmovaps 864(%rdi), %xmm12 11650; AVX2-NEXT: vbroadcastss %xmm12, %xmm1 11651; AVX2-NEXT: vmovaps 832(%rdi), %xmm2 11652; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11653; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 11654; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 11655; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 11656; AVX2-NEXT: vmovaps 992(%rdi), %xmm1 11657; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11658; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 11659; AVX2-NEXT: vmovaps 960(%rdi), %xmm2 11660; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11661; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 11662; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 11663; AVX2-NEXT: vmovaps 928(%rdi), %xmm2 11664; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11665; AVX2-NEXT: vmovaps 896(%rdi), %xmm3 11666; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11667; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 11668; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11669; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11670; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 11671; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11672; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11673; AVX2-NEXT: vmovaps 1376(%rdi), %xmm0 11674; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11675; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 11676; AVX2-NEXT: vmovaps 1344(%rdi), %xmm1 11677; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11678; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 11679; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 11680; AVX2-NEXT: vmovaps 1312(%rdi), %xmm1 11681; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11682; AVX2-NEXT: vmovaps 1280(%rdi), %xmm2 11683; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11684; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 11685; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 11686; AVX2-NEXT: vmovaps 1504(%rdi), %xmm1 11687; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11688; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 11689; AVX2-NEXT: vmovaps 1472(%rdi), %xmm2 11690; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11691; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 11692; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 11693; AVX2-NEXT: vmovaps 1440(%rdi), %xmm2 11694; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11695; AVX2-NEXT: vmovaps 1408(%rdi), %xmm3 11696; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11697; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 11698; 
AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11699; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11700; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 11701; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11702; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11703; AVX2-NEXT: vmovaps 1888(%rdi), %xmm0 11704; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11705; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 11706; AVX2-NEXT: vmovaps 1856(%rdi), %xmm1 11707; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11708; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 11709; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 11710; AVX2-NEXT: vmovaps 1824(%rdi), %xmm1 11711; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11712; AVX2-NEXT: vmovaps 1792(%rdi), %xmm2 11713; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11714; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 11715; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] 11716; AVX2-NEXT: vmovaps 2016(%rdi), %xmm0 11717; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11718; AVX2-NEXT: vbroadcastss %xmm0, %xmm1 11719; AVX2-NEXT: vmovaps 1984(%rdi), %xmm0 11720; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11721; AVX2-NEXT: vbroadcastss %xmm0, %xmm2 11722; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 11723; AVX2-NEXT: vmovaps 1952(%rdi), %xmm0 11724; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11725; AVX2-NEXT: vmovaps 1920(%rdi), %xmm2 11726; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11727; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 11728; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11729; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11730; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 11731; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] 11732; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11733; AVX2-NEXT: vmovaps 608(%rdi), %xmm0 11734; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11735; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 11736; AVX2-NEXT: vmovaps 576(%rdi), %xmm1 11737; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11738; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 11739; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 11740; AVX2-NEXT: vmovaps 544(%rdi), %xmm2 11741; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11742; AVX2-NEXT: vmovaps 512(%rdi), %xmm1 11743; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11744; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 11745; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 11746; AVX2-NEXT: vmovaps 736(%rdi), %xmm1 11747; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11748; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 11749; AVX2-NEXT: vmovaps 704(%rdi), %xmm2 11750; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11751; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 11752; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 11753; AVX2-NEXT: vmovaps 672(%rdi), %xmm3 11754; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11755; AVX2-NEXT: vmovaps 640(%rdi), %xmm2 11756; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11757; AVX2-NEXT: 
vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 11758; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11759; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11760; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 11761; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11762; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11763; AVX2-NEXT: vmovaps 1120(%rdi), %xmm0 11764; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11765; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 11766; AVX2-NEXT: vmovaps 1088(%rdi), %xmm1 11767; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11768; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 11769; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 11770; AVX2-NEXT: vmovaps 1056(%rdi), %xmm2 11771; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11772; AVX2-NEXT: vmovaps 1024(%rdi), %xmm1 11773; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11774; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 11775; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 11776; AVX2-NEXT: vmovaps 1248(%rdi), %xmm1 11777; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11778; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 11779; AVX2-NEXT: vmovaps 1216(%rdi), %xmm2 11780; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11781; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 11782; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 11783; AVX2-NEXT: vmovaps 1184(%rdi), %xmm3 11784; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11785; AVX2-NEXT: vmovaps 1152(%rdi), %xmm2 11786; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11787; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 11788; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11789; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11790; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 11791; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11792; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11793; AVX2-NEXT: vmovaps 1632(%rdi), %xmm0 11794; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11795; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 11796; AVX2-NEXT: vmovaps 1600(%rdi), %xmm1 11797; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11798; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 11799; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 11800; AVX2-NEXT: vmovaps 1568(%rdi), %xmm2 11801; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11802; AVX2-NEXT: vmovaps 1536(%rdi), %xmm1 11803; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11804; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 11805; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 11806; AVX2-NEXT: vmovaps 1760(%rdi), %xmm1 11807; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11808; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 11809; AVX2-NEXT: vmovaps 1728(%rdi), %xmm2 11810; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11811; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 11812; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 11813; AVX2-NEXT: vmovaps 1696(%rdi), %xmm3 11814; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11815; AVX2-NEXT: vmovaps 1664(%rdi), %xmm2 11816; AVX2-NEXT: 
vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11817; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 11818; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11819; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11820; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 11821; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11822; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11823; AVX2-NEXT: vmovaps 224(%rdi), %xmm0 11824; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11825; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 11826; AVX2-NEXT: vmovaps 192(%rdi), %xmm11 11827; AVX2-NEXT: vbroadcastss %xmm11, %xmm1 11828; AVX2-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11829; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 11830; AVX2-NEXT: vmovaps 160(%rdi), %xmm2 11831; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11832; AVX2-NEXT: vmovaps 128(%rdi), %xmm1 11833; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11834; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 11835; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11836; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 11837; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7] 11838; AVX2-NEXT: vmovaps 96(%rdi), %xmm8 11839; AVX2-NEXT: vbroadcastss %xmm8, %xmm1 11840; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11841; AVX2-NEXT: vmovaps 64(%rdi), %xmm7 11842; AVX2-NEXT: vbroadcastss %xmm7, %xmm2 11843; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11844; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 11845; AVX2-NEXT: vmovaps (%rdi), %xmm5 11846; AVX2-NEXT: vmovaps 32(%rdi), %xmm6 11847; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 11848; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11849; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11850; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] 11851; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] 11852; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11853; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 11854; AVX2-NEXT: # xmm0 = mem[1,1,1,1] 11855; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] 11856; AVX2-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11857; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 11858; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] 11859; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 11860; AVX2-NEXT: vmovaps %xmm13, %xmm9 11861; AVX2-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11862; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 11863; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] 11864; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11865; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 11866; AVX2-NEXT: # xmm2 = mem[1,1,1,1] 11867; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 11868; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 11869; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11870; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 11871; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11872; AVX2-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11873; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 11874; AVX2-NEXT: # xmm0 = mem[1,1,1,1] 11875; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 11876; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 11877; AVX2-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11878; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 11879; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] 11880; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 11881; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 11882; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 11883; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] 11884; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11885; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 11886; AVX2-NEXT: # xmm2 = mem[1,1,1,1] 11887; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 11888; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 11889; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11890; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 11891; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11892; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11893; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 11894; AVX2-NEXT: # xmm0 = mem[1,1,1,1] 11895; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 11896; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 11897; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 11898; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 11899; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 11900; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 11901; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 11902; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 11903; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 11904; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11905; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 11906; AVX2-NEXT: # xmm2 = mem[1,1,1,1] 11907; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 11908; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 11909; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11910; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 11911; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11912; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11913; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 11914; AVX2-NEXT: # xmm0 = mem[1,1,1,1] 11915; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 11916; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 11917; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 11918; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 11919; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 11920; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 11921; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 11922; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded 
Reload 11923; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 11924; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11925; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 11926; AVX2-NEXT: # xmm2 = mem[1,1,1,1] 11927; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 11928; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 11929; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11930; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 11931; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11932; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11933; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] 11934; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] 11935; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 11936; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 11937; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload 11938; AVX2-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] 11939; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11940; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 11941; AVX2-NEXT: # xmm2 = mem[1,1,1,1] 11942; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 11943; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 11944; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11945; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 11946; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11947; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11948; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 11949; AVX2-NEXT: # xmm0 = mem[1,1,1,1] 11950; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 11951; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 11952; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 11953; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 11954; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 11955; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 11956; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 11957; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 11958; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 11959; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11960; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 11961; AVX2-NEXT: # xmm2 = mem[1,1,1,1] 11962; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 11963; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 11964; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11965; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 11966; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11967; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11968; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 11969; AVX2-NEXT: # xmm0 = mem[1,1,1,1] 11970; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 11971; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 11972; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 11973; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 11974; AVX2-NEXT: # xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1] 11975; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 11976; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 11977; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 11978; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] 11979; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11980; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 11981; AVX2-NEXT: # xmm2 = mem[1,1,1,1] 11982; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 11983; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 11984; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11985; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 11986; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11987; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11988; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 11989; AVX2-NEXT: # xmm0 = mem[1,1,1,1] 11990; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 11991; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 11992; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 11993; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 11994; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 11995; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 11996; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 11997; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 11998; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 11999; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 12000; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 12001; AVX2-NEXT: # xmm2 = mem[1,1,1,1] 12002; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 12003; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 12004; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 12005; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 12006; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 12007; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12008; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12009; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload 12010; AVX2-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] 12011; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12012; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12013; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 12014; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 12015; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12016; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2] 12017; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] 12018; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 12019; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 12020; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 12021; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 12022; AVX2-NEXT: # xmm1 = mem[2,2,2,2] 12023; AVX2-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload 12024; AVX2-NEXT: # xmm1 = mem[0,1,2],xmm1[3] 12025; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] 12026; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] 12027; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12028; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12029; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload 12030; AVX2-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] 12031; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12032; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12033; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 12034; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 12035; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12036; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2] 12037; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] 12038; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 12039; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 12040; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 12041; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 12042; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] 12043; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 12044; AVX2-NEXT: # xmm1 = mem[0,1,2],xmm1[3] 12045; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 12046; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12047; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12048; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12049; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload 12050; AVX2-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3] 12051; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12052; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 12053; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 12054; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12055; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] 12056; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] 12057; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 12058; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 12059; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 12060; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 12061; AVX2-NEXT: # xmm1 = mem[2,2,2,2] 12062; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 12063; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] 12064; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] 12065; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12066; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12067; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12068; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload 12069; AVX2-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] 12070; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12071; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 12072; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 12073; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12074; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 12075; AVX2-NEXT: # xmm0 = mem[2,2,2,2] 12076; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 12077; AVX2-NEXT: # xmm0 = 
mem[0,1,2],xmm0[3] 12078; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 12079; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 12080; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 12081; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 12082; AVX2-NEXT: # xmm1 = mem[2,2,2,2] 12083; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 12084; AVX2-NEXT: # xmm1 = mem[0,1,2],xmm1[3] 12085; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] 12086; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12087; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12088; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12089; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload 12090; AVX2-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] 12091; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12092; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12093; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 12094; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 12095; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12096; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,2,2,2] 12097; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] 12098; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 12099; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 12100; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 12101; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 12102; AVX2-NEXT: # xmm1 = mem[2,2,2,2] 12103; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 12104; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] 12105; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 12106; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12107; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12108; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12109; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload 12110; AVX2-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] 12111; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12112; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12113; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 12114; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 12115; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12116; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] 12117; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] 12118; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 12119; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 12120; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 12121; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 12122; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] 12123; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 12124; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] 12125; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 12126; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12127; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12128; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12129; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, 
%xmm5 # 16-byte Folded Reload 12130; AVX2-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] 12131; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12132; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12133; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 12134; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 12135; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12136; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 12137; AVX2-NEXT: # xmm0 = mem[2,2,2,2] 12138; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 12139; AVX2-NEXT: # xmm0 = mem[0,1,2],xmm0[3] 12140; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 12141; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 12142; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] 12143; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 12144; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2] 12145; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 12146; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] 12147; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3] 12148; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] 12149; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12150; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12151; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload 12152; AVX2-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] 12153; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12154; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 12155; AVX2-NEXT: # xmm15 = mem[2,2,2,2] 12156; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 12157; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3] 12158; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 12159; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm14 12160; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 12161; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 12162; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload 12163; AVX2-NEXT: # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3] 12164; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 12165; AVX2-NEXT: # xmm13 = mem[2,2,2,2] 12166; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload 12167; AVX2-NEXT: # xmm13 = mem[0,1,2],xmm13[3] 12168; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] 12169; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7] 12170; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12171; AVX2-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload 12172; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload 12173; AVX2-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3] 12174; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 12175; AVX2-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm5[1],xmm13[1] 12176; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 12177; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload 12178; AVX2-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3] 12179; AVX2-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 12180; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 
# 16-byte Folded Reload 12181; AVX2-NEXT: # xmm12 = mem[2,3,2,3] 12182; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 12183; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] 12184; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] 12185; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12186; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 12187; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] 12188; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 12189; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1] 12190; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 12191; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload 12192; AVX2-NEXT: # xmm12 = xmm5[2],mem[2],xmm5[3],mem[3] 12193; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 12194; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 12195; AVX2-NEXT: # xmm13 = mem[2,3,2,3] 12196; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 12197; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] 12198; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] 12199; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12200; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload 12201; AVX2-NEXT: # xmm11 = xmm7[2],mem[2],xmm7[3],mem[3] 12202; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm11[1] 12203; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 12204; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload 12205; AVX2-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3] 12206; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 12207; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 12208; AVX2-NEXT: # xmm10 = mem[2,3,2,3] 12209; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 12210; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] 12211; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 12212; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12213; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 12214; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload 12215; AVX2-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] 12216; AVX2-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm9[1] 12217; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 12218; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload 12219; AVX2-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] 12220; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 12221; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 12222; AVX2-NEXT: # xmm8 = mem[2,3,2,3] 12223; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 12224; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] 12225; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 12226; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12227; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload 12228; AVX2-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] 12229; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 12230; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] 12231; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 12232; 
AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload 12233; AVX2-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] 12234; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 12235; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 12236; AVX2-NEXT: # xmm6 = mem[2,3,2,3] 12237; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 12238; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] 12239; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] 12240; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12241; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] 12242; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 12243; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] 12244; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 12245; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload 12246; AVX2-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3] 12247; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 12248; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 12249; AVX2-NEXT: # xmm4 = mem[2,3,2,3] 12250; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 12251; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 12252; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] 12253; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12254; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] 12255; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 12256; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 12257; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 12258; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload 12259; AVX2-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] 12260; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 12261; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 12262; AVX2-NEXT: # xmm2 = mem[2,3,2,3] 12263; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 12264; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 12265; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 12266; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12267; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 12268; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 12269; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 12270; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 12271; AVX2-NEXT: # xmm0 = mem[2,3,2,3] 12272; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 12273; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 12274; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 12275; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 12276; AVX2-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] 12277; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] 12278; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12279; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12280; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 12281; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12282; AVX2-NEXT: vmovaps (%rdi), %ymm1 12283; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12284; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 12285; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 12286; AVX2-NEXT: vmovaps 96(%rdi), %ymm1 12287; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12288; AVX2-NEXT: vmovaps 64(%rdi), %ymm2 12289; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12290; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 12291; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12292; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 12293; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 12294; AVX2-NEXT: vmovaps 224(%rdi), %ymm2 12295; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12296; AVX2-NEXT: vmovaps 192(%rdi), %ymm3 12297; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12298; AVX2-NEXT: vmovaps 160(%rdi), %ymm15 12299; AVX2-NEXT: vmovaps 128(%rdi), %ymm1 12300; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12301; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] 12302; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12303; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 12304; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12305; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] 12306; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 12307; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12308; AVX2-NEXT: vmovaps 288(%rdi), %ymm0 12309; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12310; AVX2-NEXT: vmovaps 256(%rdi), %ymm1 12311; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12312; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 12313; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 12314; AVX2-NEXT: vmovaps 352(%rdi), %ymm1 12315; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12316; AVX2-NEXT: vmovaps 320(%rdi), %ymm2 12317; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12318; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 12319; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12320; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 12321; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 12322; AVX2-NEXT: vmovaps 480(%rdi), %ymm2 12323; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12324; AVX2-NEXT: vmovaps 448(%rdi), %ymm3 12325; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12326; AVX2-NEXT: vmovaps 416(%rdi), %ymm4 12327; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12328; AVX2-NEXT: vmovaps 384(%rdi), %ymm1 12329; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12330; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] 12331; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 12332; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] 12333; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 12334; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12335; AVX2-NEXT: vmovaps 544(%rdi), %ymm0 12336; AVX2-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12337; AVX2-NEXT: vmovaps 512(%rdi), %ymm1 12338; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12339; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 12340; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 12341; AVX2-NEXT: vmovaps 608(%rdi), %ymm1 12342; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12343; AVX2-NEXT: vmovaps 576(%rdi), %ymm2 12344; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12345; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 12346; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12347; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 12348; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 12349; AVX2-NEXT: vmovaps 736(%rdi), %ymm2 12350; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12351; AVX2-NEXT: vmovaps 704(%rdi), %ymm3 12352; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12353; AVX2-NEXT: vmovaps 672(%rdi), %ymm4 12354; AVX2-NEXT: vmovaps 640(%rdi), %ymm1 12355; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12356; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] 12357; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12358; AVX2-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 12359; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] 12360; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 12361; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12362; AVX2-NEXT: vmovaps 800(%rdi), %ymm0 12363; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12364; AVX2-NEXT: vmovaps 768(%rdi), %ymm1 12365; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12366; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 12367; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 12368; AVX2-NEXT: vmovaps 864(%rdi), %ymm1 12369; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12370; AVX2-NEXT: vmovaps 832(%rdi), %ymm2 12371; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12372; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 12373; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12374; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 12375; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 12376; AVX2-NEXT: vmovaps 992(%rdi), %ymm2 12377; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12378; AVX2-NEXT: vmovaps 960(%rdi), %ymm5 12379; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12380; AVX2-NEXT: vmovaps 928(%rdi), %ymm3 12381; AVX2-NEXT: vmovaps 896(%rdi), %ymm1 12382; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12383; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] 12384; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12385; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] 12386; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] 12387; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 12388; AVX2-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12389; AVX2-NEXT: vmovaps 1056(%rdi), %ymm0 12390; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12391; AVX2-NEXT: vmovaps 1024(%rdi), %ymm1 12392; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12393; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 12394; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 12395; AVX2-NEXT: vmovaps 1120(%rdi), %ymm1 12396; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12397; AVX2-NEXT: vmovaps 1088(%rdi), %ymm2 12398; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12399; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 12400; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12401; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 12402; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 12403; AVX2-NEXT: vmovaps 1248(%rdi), %ymm1 12404; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12405; AVX2-NEXT: vmovaps 1216(%rdi), %ymm7 12406; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12407; AVX2-NEXT: vmovaps 1184(%rdi), %ymm2 12408; AVX2-NEXT: vmovaps 1152(%rdi), %ymm6 12409; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12410; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] 12411; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12412; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5] 12413; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] 12414; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] 12415; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12416; AVX2-NEXT: vmovaps 1312(%rdi), %ymm0 12417; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12418; AVX2-NEXT: vmovaps 1280(%rdi), %ymm6 12419; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12420; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] 12421; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm6 12422; AVX2-NEXT: vmovaps 1376(%rdi), %ymm0 12423; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12424; AVX2-NEXT: vmovaps 1344(%rdi), %ymm7 12425; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12426; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5] 12427; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12428; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] 12429; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] 12430; AVX2-NEXT: vmovaps 1504(%rdi), %ymm6 12431; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12432; AVX2-NEXT: vmovaps 1472(%rdi), %ymm10 12433; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12434; AVX2-NEXT: vmovaps 1440(%rdi), %ymm0 12435; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 12436; AVX2-NEXT: vmovaps 1408(%rdi), %ymm9 12437; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12438; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] 12439; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] 12440; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = 
ymm9[0],ymm0[0],ymm9[2],ymm0[2] 12441; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] 12442; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12443; AVX2-NEXT: vmovaps 1568(%rdi), %ymm6 12444; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12445; AVX2-NEXT: vmovaps 1536(%rdi), %ymm7 12446; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12447; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 12448; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm9 12449; AVX2-NEXT: vmovaps 1632(%rdi), %ymm6 12450; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12451; AVX2-NEXT: vmovaps 1600(%rdi), %ymm7 12452; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12453; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 12454; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12455; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2] 12456; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3] 12457; AVX2-NEXT: vmovaps 1760(%rdi), %ymm9 12458; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12459; AVX2-NEXT: vmovaps 1728(%rdi), %ymm6 12460; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12461; AVX2-NEXT: vmovaps 1696(%rdi), %ymm7 12462; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12463; AVX2-NEXT: vmovaps 1664(%rdi), %ymm11 12464; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12465; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] 12466; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] 12467; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12468; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] 12469; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 12470; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12471; AVX2-NEXT: vmovaps 1824(%rdi), %ymm6 12472; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12473; AVX2-NEXT: vmovaps 1792(%rdi), %ymm7 12474; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12475; AVX2-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 12476; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm11 12477; AVX2-NEXT: vmovaps 1888(%rdi), %ymm6 12478; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12479; AVX2-NEXT: vmovaps 1856(%rdi), %ymm7 12480; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12481; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 12482; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12483; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2] 12484; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3] 12485; AVX2-NEXT: vmovaps 2016(%rdi), %ymm11 12486; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12487; AVX2-NEXT: vmovaps 1984(%rdi), %ymm6 12488; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12489; AVX2-NEXT: vmovaps 1952(%rdi), %ymm7 12490; AVX2-NEXT: vmovaps 1920(%rdi), %ymm9 12491; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12492; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] 
12493; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12494; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5] 12495; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] 12496; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] 12497; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12498; AVX2-NEXT: vbroadcastss 148(%rdi), %ymm13 12499; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] 12500; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload 12501; AVX2-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] 12502; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 12503; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm14 12504; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 12505; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] 12506; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 12507; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7] 12508; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15 12509; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] 12510; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] 12511; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12512; AVX2-NEXT: vbroadcastss 404(%rdi), %ymm13 12513; AVX2-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload 12514; AVX2-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7] 12515; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] 12516; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 12517; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm13 12518; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 12519; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5] 12520; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 12521; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] 12522; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14 12523; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] 12524; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] 12525; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12526; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm12 12527; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7] 12528; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7] 12529; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 12530; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8 12531; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 12532; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5] 12533; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 12534; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] 12535; AVX2-NEXT: vextractf128 $1, %ymm12, %xmm12 12536; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] 12537; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 12538; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12539; AVX2-NEXT: vbroadcastss 916(%rdi), %ymm4 12540; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] 12541; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7] 12542; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 
12543; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 12544; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 12545; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] 12546; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 12547; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] 12548; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 12549; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] 12550; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 12551; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12552; AVX2-NEXT: vbroadcastss 1172(%rdi), %ymm3 12553; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] 12554; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] 12555; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12556; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 12557; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 12558; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5] 12559; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12560; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] 12561; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3 12562; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] 12563; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 12564; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12565; AVX2-NEXT: vbroadcastss 1428(%rdi), %ymm1 12566; AVX2-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload 12567; AVX2-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7] 12568; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] 12569; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12570; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 12571; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 12572; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] 12573; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 12574; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] 12575; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 12576; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] 12577; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 12578; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12579; AVX2-NEXT: vbroadcastss 1684(%rdi), %ymm0 12580; AVX2-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12581; AVX2-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7] 12582; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12583; AVX2-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 12584; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12585; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 12586; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 12587; AVX2-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] 12588; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 12589; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] 12590; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 12591; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 12592; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12593; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12594; AVX2-NEXT: vbroadcastss 1940(%rdi), %ymm0 12595; AVX2-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] 12596; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] 12597; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12598; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 12599; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 12600; AVX2-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] 12601; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 12602; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] 12603; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 12604; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 12605; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12606; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12607; AVX2-NEXT: vbroadcastss 248(%rdi), %ymm0 12608; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12609; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 12610; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12611; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload 12612; AVX2-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12613; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12614; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] 12615; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12616; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12617; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 12618; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12619; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12620; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1 12621; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] 12622; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 12623; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] 12624; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12625; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12626; AVX2-NEXT: vbroadcastss 504(%rdi), %ymm0 12627; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12628; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 12629; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12630; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 12631; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12632; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12633; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7] 12634; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12635; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12636; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload 12637; AVX2-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12638; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12639; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1 12640; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] 12641; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 12642; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] 12643; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] 12644; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12645; AVX2-NEXT: vbroadcastss 760(%rdi), %ymm0 12646; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12647; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 12648; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12649; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 12650; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12651; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12652; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] 12653; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12654; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12655; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload 12656; AVX2-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12657; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12658; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1 12659; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] 12660; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 12661; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] 12662; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12663; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12664; AVX2-NEXT: vbroadcastss 1016(%rdi), %ymm0 12665; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12666; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 12667; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12668; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 12669; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12670; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12671; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] 12672; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12673; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12674; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload 12675; AVX2-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12676; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1 12677; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2] 12678; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 12679; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] 12680; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12681; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12682; AVX2-NEXT: vbroadcastss 1272(%rdi), %ymm0 12683; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12684; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 12685; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12686; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload 12687; AVX2-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12688; AVX2-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] 12689; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12690; AVX2-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload 12691; AVX2-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12692; AVX2-NEXT: vextractf128 $1, %ymm13, %xmm1 12693; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2] 12694; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 12695; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] 12696; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12697; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12698; AVX2-NEXT: vbroadcastss 1528(%rdi), %ymm0 12699; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12700; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 12701; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12702; AVX2-NEXT: vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload 12703; AVX2-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12704; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload 12705; AVX2-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] 12706; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12707; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload 12708; AVX2-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12709; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm1 12710; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2] 12711; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 12712; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] 12713; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12714; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12715; AVX2-NEXT: vbroadcastss 1784(%rdi), %ymm0 12716; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12717; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 12718; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12719; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 12720; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12721; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12722; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload 12723; AVX2-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12724; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12725; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 12726; AVX2-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 12727; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm1 12728; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[2,2,2,2] 12729; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 12730; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] 12731; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12732; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12733; AVX2-NEXT: vbroadcastss 2040(%rdi), %ymm0 12734; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 12735; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] 12736; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12737; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 12738; AVX2-NEXT: # ymm4 = 
ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 12739; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12740; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 12741; AVX2-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 12742; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12743; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 12744; AVX2-NEXT: # ymm2 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 12745; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm0 12746; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,2,2,2] 12747; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] 12748; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] 12749; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 12750; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 12751; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm0 12752; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12753; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 12754; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12755; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 12756; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12757; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 12758; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload 12759; AVX2-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] 12760; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15 12761; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] 12762; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12763; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12764; AVX2-NEXT: vbroadcastss 476(%rdi), %ymm0 12765; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12766; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 12767; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12768; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 12769; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12770; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 12771; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload 12772; AVX2-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] 12773; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15 12774; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] 12775; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12776; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12777; AVX2-NEXT: vbroadcastss 732(%rdi), %ymm0 12778; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12779; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 12780; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12781; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 12782; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12783; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 12784; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload 12785; AVX2-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] 12786; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15 12787; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] 12788; AVX2-NEXT: vblendps {{.*#+}} ymm15 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] 12789; AVX2-NEXT: vbroadcastss 988(%rdi), %ymm0 12790; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12791; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 12792; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12793; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 12794; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm1 12795; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 12796; AVX2-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] 12797; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14 12798; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] 12799; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12800; AVX2-NEXT: vbroadcastss 1244(%rdi), %ymm0 12801; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12802; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 12803; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] 12804; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm1 12805; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7] 12806; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm11 12807; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] 12808; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12809; AVX2-NEXT: vbroadcastss 1500(%rdi), %ymm0 12810; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12811; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 12812; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] 12813; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8 12814; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] 12815; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 12816; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] 12817; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] 12818; AVX2-NEXT: vbroadcastss 1756(%rdi), %ymm0 12819; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12820; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 12821; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] 12822; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 12823; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7] 12824; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 12825; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 12826; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] 12827; AVX2-NEXT: vbroadcastss 2012(%rdi), %ymm0 12828; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12829; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 12830; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] 12831; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 12832; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] 12833; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3 12834; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] 12835; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 12836; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12837; AVX2-NEXT: vmovaps %ymm2, 192(%rsi) 12838; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12839; AVX2-NEXT: vmovaps %ymm2, 128(%rsi) 12840; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12841; AVX2-NEXT: vmovaps %ymm2, 64(%rsi) 12842; AVX2-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12843; AVX2-NEXT: vmovaps %ymm2, (%rsi) 12844; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12845; AVX2-NEXT: vmovaps %ymm2, 224(%rsi) 12846; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12847; AVX2-NEXT: vmovaps %ymm2, 160(%rsi) 12848; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12849; AVX2-NEXT: vmovaps %ymm2, 96(%rsi) 12850; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12851; AVX2-NEXT: vmovaps %ymm2, 32(%rsi) 12852; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12853; AVX2-NEXT: vmovaps %ymm2, 192(%rdx) 12854; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12855; AVX2-NEXT: vmovaps %ymm2, 128(%rdx) 12856; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12857; AVX2-NEXT: vmovaps %ymm2, 64(%rdx) 12858; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12859; AVX2-NEXT: vmovaps %ymm2, (%rdx) 12860; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12861; AVX2-NEXT: vmovaps %ymm2, 224(%rdx) 12862; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12863; AVX2-NEXT: vmovaps %ymm2, 160(%rdx) 12864; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12865; AVX2-NEXT: vmovaps %ymm2, 96(%rdx) 12866; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12867; AVX2-NEXT: vmovaps %ymm2, 32(%rdx) 12868; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12869; AVX2-NEXT: vmovaps %ymm2, 192(%rcx) 12870; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12871; AVX2-NEXT: vmovaps %ymm2, 128(%rcx) 12872; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12873; AVX2-NEXT: vmovaps %ymm2, 64(%rcx) 12874; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12875; AVX2-NEXT: vmovaps %ymm2, (%rcx) 12876; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12877; AVX2-NEXT: vmovaps %ymm2, 224(%rcx) 12878; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12879; AVX2-NEXT: vmovaps %ymm2, 160(%rcx) 12880; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12881; AVX2-NEXT: vmovaps %ymm2, 96(%rcx) 12882; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12883; AVX2-NEXT: vmovaps %ymm2, 32(%rcx) 12884; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12885; AVX2-NEXT: vmovaps %ymm2, 192(%r8) 12886; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12887; AVX2-NEXT: vmovaps %ymm2, 128(%r8) 12888; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12889; AVX2-NEXT: vmovaps %ymm2, 64(%r8) 12890; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12891; AVX2-NEXT: vmovaps %ymm2, (%r8) 12892; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12893; AVX2-NEXT: vmovaps %ymm2, 224(%r8) 12894; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12895; AVX2-NEXT: vmovaps %ymm2, 160(%r8) 12896; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12897; AVX2-NEXT: vmovaps %ymm2, 96(%r8) 12898; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12899; AVX2-NEXT: vmovaps %ymm2, 32(%r8) 12900; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12901; AVX2-NEXT: vmovaps %ymm2, 224(%r9) 12902; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 
32-byte Reload 12903; AVX2-NEXT: vmovaps %ymm2, 192(%r9) 12904; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12905; AVX2-NEXT: vmovaps %ymm2, 160(%r9) 12906; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12907; AVX2-NEXT: vmovaps %ymm2, 128(%r9) 12908; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12909; AVX2-NEXT: vmovaps %ymm2, 96(%r9) 12910; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12911; AVX2-NEXT: vmovaps %ymm2, 64(%r9) 12912; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12913; AVX2-NEXT: vmovaps %ymm2, 32(%r9) 12914; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12915; AVX2-NEXT: vmovaps %ymm2, (%r9) 12916; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 12917; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12918; AVX2-NEXT: vmovaps %ymm2, 224(%rax) 12919; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12920; AVX2-NEXT: vmovaps %ymm2, 192(%rax) 12921; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12922; AVX2-NEXT: vmovaps %ymm2, 160(%rax) 12923; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12924; AVX2-NEXT: vmovaps %ymm2, 128(%rax) 12925; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12926; AVX2-NEXT: vmovaps %ymm2, 96(%rax) 12927; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12928; AVX2-NEXT: vmovaps %ymm2, 64(%rax) 12929; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12930; AVX2-NEXT: vmovaps %ymm2, 32(%rax) 12931; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12932; AVX2-NEXT: vmovaps %ymm2, (%rax) 12933; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 12934; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload 12935; AVX2-NEXT: vmovaps %ymm2, 224(%rax) 12936; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12937; AVX2-NEXT: vmovaps %ymm2, 192(%rax) 12938; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12939; AVX2-NEXT: vmovaps %ymm2, 160(%rax) 12940; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12941; AVX2-NEXT: vmovaps %ymm2, 128(%rax) 12942; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12943; AVX2-NEXT: vmovaps %ymm2, 96(%rax) 12944; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12945; AVX2-NEXT: vmovaps %ymm2, 64(%rax) 12946; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12947; AVX2-NEXT: vmovaps %ymm2, 32(%rax) 12948; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12949; AVX2-NEXT: vmovaps %ymm2, (%rax) 12950; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 12951; AVX2-NEXT: vmovaps %ymm0, 224(%rax) 12952; AVX2-NEXT: vmovaps %ymm5, 192(%rax) 12953; AVX2-NEXT: vmovaps %ymm8, 160(%rax) 12954; AVX2-NEXT: vmovaps %ymm1, 128(%rax) 12955; AVX2-NEXT: vmovaps %ymm14, 96(%rax) 12956; AVX2-NEXT: vmovaps %ymm15, 64(%rax) 12957; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12958; AVX2-NEXT: vmovaps %ymm0, 32(%rax) 12959; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12960; AVX2-NEXT: vmovaps %ymm0, (%rax) 12961; AVX2-NEXT: addq $3528, %rsp # imm = 0xDC8 12962; AVX2-NEXT: vzeroupper 12963; AVX2-NEXT: retq 12964; 12965; AVX2-FP-LABEL: load_i32_stride8_vf64: 12966; AVX2-FP: # %bb.0: 12967; AVX2-FP-NEXT: subq $3528, %rsp # imm = 0xDC8 12968; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm10 12969; AVX2-FP-NEXT: vmovaps 
256(%rdi), %xmm0 12970; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12971; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] 12972; AVX2-FP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12973; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm9 12974; AVX2-FP-NEXT: vbroadcastss %xmm9, %xmm1 12975; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm2 12976; AVX2-FP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill 12977; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2 12978; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 12979; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 12980; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm1 12981; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12982; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm2 12983; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12984; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 12985; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 12986; AVX2-FP-NEXT: vmovaps 480(%rdi), %xmm13 12987; AVX2-FP-NEXT: vbroadcastss %xmm13, %xmm2 12988; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm3 12989; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12990; AVX2-FP-NEXT: vbroadcastss %xmm3, %xmm3 12991; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 12992; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 12993; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] 12994; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 12995; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12996; AVX2-FP-NEXT: vmovaps 800(%rdi), %xmm0 12997; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12998; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm1 12999; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13000; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 13001; AVX2-FP-NEXT: vmovaps 864(%rdi), %xmm12 13002; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm1 13003; AVX2-FP-NEXT: vmovaps 832(%rdi), %xmm2 13004; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13005; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2 13006; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 13007; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13008; AVX2-FP-NEXT: vmovaps 992(%rdi), %xmm1 13009; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13010; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 13011; AVX2-FP-NEXT: vmovaps 960(%rdi), %xmm2 13012; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13013; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2 13014; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 13015; AVX2-FP-NEXT: vmovaps 928(%rdi), %xmm2 13016; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13017; AVX2-FP-NEXT: vmovaps 896(%rdi), %xmm3 13018; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13019; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 13020; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13021; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13022; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 13023; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13024; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13025; AVX2-FP-NEXT: vmovaps 1376(%rdi), %xmm0 13026; AVX2-FP-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13027; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0 13028; AVX2-FP-NEXT: vmovaps 1344(%rdi), %xmm1 13029; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13030; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 13031; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 13032; AVX2-FP-NEXT: vmovaps 1312(%rdi), %xmm1 13033; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13034; AVX2-FP-NEXT: vmovaps 1280(%rdi), %xmm2 13035; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13036; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 13037; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 13038; AVX2-FP-NEXT: vmovaps 1504(%rdi), %xmm1 13039; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13040; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 13041; AVX2-FP-NEXT: vmovaps 1472(%rdi), %xmm2 13042; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13043; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2 13044; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 13045; AVX2-FP-NEXT: vmovaps 1440(%rdi), %xmm2 13046; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13047; AVX2-FP-NEXT: vmovaps 1408(%rdi), %xmm3 13048; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13049; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 13050; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13051; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13052; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 13053; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13054; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13055; AVX2-FP-NEXT: vmovaps 1888(%rdi), %xmm0 13056; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13057; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0 13058; AVX2-FP-NEXT: vmovaps 1856(%rdi), %xmm1 13059; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13060; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 13061; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 13062; AVX2-FP-NEXT: vmovaps 1824(%rdi), %xmm1 13063; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13064; AVX2-FP-NEXT: vmovaps 1792(%rdi), %xmm2 13065; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13066; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 13067; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] 13068; AVX2-FP-NEXT: vmovaps 2016(%rdi), %xmm0 13069; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13070; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm1 13071; AVX2-FP-NEXT: vmovaps 1984(%rdi), %xmm0 13072; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13073; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm2 13074; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 13075; AVX2-FP-NEXT: vmovaps 1952(%rdi), %xmm0 13076; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13077; AVX2-FP-NEXT: vmovaps 1920(%rdi), %xmm2 13078; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13079; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 13080; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13081; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13082; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1,2,3,4,5],ymm1[6,7] 13083; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] 13084; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13085; AVX2-FP-NEXT: vmovaps 608(%rdi), %xmm0 13086; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13087; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0 13088; AVX2-FP-NEXT: vmovaps 576(%rdi), %xmm1 13089; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13090; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 13091; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 13092; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm2 13093; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13094; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm1 13095; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13096; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 13097; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 13098; AVX2-FP-NEXT: vmovaps 736(%rdi), %xmm1 13099; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13100; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 13101; AVX2-FP-NEXT: vmovaps 704(%rdi), %xmm2 13102; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13103; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2 13104; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 13105; AVX2-FP-NEXT: vmovaps 672(%rdi), %xmm3 13106; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13107; AVX2-FP-NEXT: vmovaps 640(%rdi), %xmm2 13108; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13109; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 13110; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13111; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13112; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 13113; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13114; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13115; AVX2-FP-NEXT: vmovaps 1120(%rdi), %xmm0 13116; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13117; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0 13118; AVX2-FP-NEXT: vmovaps 1088(%rdi), %xmm1 13119; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13120; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 13121; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 13122; AVX2-FP-NEXT: vmovaps 1056(%rdi), %xmm2 13123; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13124; AVX2-FP-NEXT: vmovaps 1024(%rdi), %xmm1 13125; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13126; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 13127; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 13128; AVX2-FP-NEXT: vmovaps 1248(%rdi), %xmm1 13129; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13130; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 13131; AVX2-FP-NEXT: vmovaps 1216(%rdi), %xmm2 13132; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13133; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2 13134; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 13135; AVX2-FP-NEXT: vmovaps 1184(%rdi), %xmm3 13136; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13137; AVX2-FP-NEXT: vmovaps 1152(%rdi), %xmm2 13138; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
13139; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 13140; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13141; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13142; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 13143; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13144; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13145; AVX2-FP-NEXT: vmovaps 1632(%rdi), %xmm0 13146; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13147; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0 13148; AVX2-FP-NEXT: vmovaps 1600(%rdi), %xmm1 13149; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13150; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 13151; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 13152; AVX2-FP-NEXT: vmovaps 1568(%rdi), %xmm2 13153; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13154; AVX2-FP-NEXT: vmovaps 1536(%rdi), %xmm1 13155; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13156; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 13157; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 13158; AVX2-FP-NEXT: vmovaps 1760(%rdi), %xmm1 13159; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13160; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 13161; AVX2-FP-NEXT: vmovaps 1728(%rdi), %xmm2 13162; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13163; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2 13164; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 13165; AVX2-FP-NEXT: vmovaps 1696(%rdi), %xmm3 13166; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13167; AVX2-FP-NEXT: vmovaps 1664(%rdi), %xmm2 13168; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13169; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 13170; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13171; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13172; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 13173; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13174; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13175; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm0 13176; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13177; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0 13178; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm11 13179; AVX2-FP-NEXT: vbroadcastss %xmm11, %xmm1 13180; AVX2-FP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13181; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 13182; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm2 13183; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13184; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm1 13185; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13186; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 13187; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13188; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 13189; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7] 13190; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm8 13191; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm1 13192; AVX2-FP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13193; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm7 13194; AVX2-FP-NEXT: vbroadcastss %xmm7, %xmm2 13195; AVX2-FP-NEXT: vmovaps %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13196; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 13197; AVX2-FP-NEXT: vmovaps (%rdi), %xmm5 13198; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm6 13199; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 13200; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13201; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13202; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] 13203; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] 13204; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13205; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 13206; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1] 13207; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] 13208; AVX2-FP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13209; AVX2-FP-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 13210; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] 13211; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13212; AVX2-FP-NEXT: vmovaps %xmm13, %xmm9 13213; AVX2-FP-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13214; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 13215; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] 13216; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13217; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 13218; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1] 13219; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 13220; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 13221; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13222; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 13223; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13224; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13225; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 13226; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1] 13227; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 13228; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 13229; AVX2-FP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13230; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 13231; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] 13232; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13233; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 13234; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 13235; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] 13236; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13237; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 13238; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1] 13239; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 13240; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 13241; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13242; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 13243; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13244; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13245; AVX2-FP-NEXT: vpermilps $85, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 13246; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1] 13247; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 13248; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 13249; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 13250; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 13251; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 13252; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13253; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 13254; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 13255; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 13256; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13257; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 13258; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1] 13259; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 13260; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 13261; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13262; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 13263; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13264; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13265; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 13266; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1] 13267; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 13268; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 13269; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 13270; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 13271; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 13272; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13273; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 13274; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 13275; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 13276; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13277; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 13278; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1] 13279; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 13280; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 13281; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13282; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 13283; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13284; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13285; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] 13286; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] 13287; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 13288; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13289; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload 13290; AVX2-FP-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] 13291; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13292; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 13293; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1] 13294; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, 
%xmm2 # 16-byte Folded Reload 13295; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 13296; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13297; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 13298; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13299; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13300; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 13301; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1] 13302; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 13303; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 13304; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 13305; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 13306; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 13307; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13308; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 13309; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 13310; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 13311; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13312; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 13313; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1] 13314; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 13315; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 13316; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13317; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 13318; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13319; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13320; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 13321; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1] 13322; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 13323; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 13324; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 13325; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 13326; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 13327; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13328; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 13329; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 13330; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] 13331; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13332; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 13333; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1] 13334; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 13335; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 13336; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13337; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 13338; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13339; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13340; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 13341; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1] 13342; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 13343; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 
13344; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 13345; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 13346; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 13347; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13348; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 13349; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 13350; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 13351; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13352; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 13353; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1] 13354; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 13355; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 13356; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13357; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 13358; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13359; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13360; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13361; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload 13362; AVX2-FP-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] 13363; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13364; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13365; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 13366; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 13367; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13368; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2] 13369; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] 13370; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 13371; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13372; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 13373; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 13374; AVX2-FP-NEXT: # xmm1 = mem[2,2,2,2] 13375; AVX2-FP-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload 13376; AVX2-FP-NEXT: # xmm1 = mem[0,1,2],xmm1[3] 13377; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] 13378; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 13379; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13380; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13381; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload 13382; AVX2-FP-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] 13383; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13384; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13385; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 13386; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 13387; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13388; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2] 13389; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] 13390; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 13391; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13392; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 13393; AVX2-FP-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 13394; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] 13395; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 13396; AVX2-FP-NEXT: # xmm1 = mem[0,1,2],xmm1[3] 13397; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 13398; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 13399; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13400; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13401; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload 13402; AVX2-FP-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3] 13403; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13404; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 13405; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 13406; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13407; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] 13408; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] 13409; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 13410; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13411; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 13412; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 13413; AVX2-FP-NEXT: # xmm1 = mem[2,2,2,2] 13414; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 13415; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] 13416; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] 13417; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 13418; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13419; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13420; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload 13421; AVX2-FP-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] 13422; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13423; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 13424; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 13425; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13426; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 13427; AVX2-FP-NEXT: # xmm0 = mem[2,2,2,2] 13428; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 13429; AVX2-FP-NEXT: # xmm0 = mem[0,1,2],xmm0[3] 13430; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 13431; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13432; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 13433; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 13434; AVX2-FP-NEXT: # xmm1 = mem[2,2,2,2] 13435; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 13436; AVX2-FP-NEXT: # xmm1 = mem[0,1,2],xmm1[3] 13437; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] 13438; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 13439; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13440; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13441; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload 13442; AVX2-FP-NEXT: # xmm2 = 
xmm0[2],mem[2],xmm0[3],mem[3] 13443; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13444; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13445; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 13446; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 13447; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13448; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,2,2,2] 13449; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] 13450; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 13451; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13452; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 13453; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 13454; AVX2-FP-NEXT: # xmm1 = mem[2,2,2,2] 13455; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 13456; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] 13457; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 13458; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 13459; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13460; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13461; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload 13462; AVX2-FP-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] 13463; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13464; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13465; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 13466; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 13467; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13468; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] 13469; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] 13470; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 13471; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13472; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 13473; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 13474; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] 13475; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 13476; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] 13477; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 13478; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 13479; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13480; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13481; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload 13482; AVX2-FP-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] 13483; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13484; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13485; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 13486; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 13487; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13488; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 13489; AVX2-FP-NEXT: # xmm0 = mem[2,2,2,2] 13490; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 13491; AVX2-FP-NEXT: # xmm0 = 
mem[0,1,2],xmm0[3] 13492; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 13493; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 13494; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] 13495; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 13496; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2] 13497; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 13498; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] 13499; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3] 13500; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] 13501; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13502; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13503; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload 13504; AVX2-FP-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] 13505; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13506; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 13507; AVX2-FP-NEXT: # xmm15 = mem[2,2,2,2] 13508; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 13509; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3] 13510; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 13511; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm14 13512; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 13513; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 13514; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload 13515; AVX2-FP-NEXT: # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3] 13516; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 13517; AVX2-FP-NEXT: # xmm13 = mem[2,2,2,2] 13518; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload 13519; AVX2-FP-NEXT: # xmm13 = mem[0,1,2],xmm13[3] 13520; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] 13521; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7] 13522; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13523; AVX2-FP-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload 13524; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload 13525; AVX2-FP-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3] 13526; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 13527; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm5[1],xmm13[1] 13528; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 13529; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload 13530; AVX2-FP-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3] 13531; AVX2-FP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 13532; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 13533; AVX2-FP-NEXT: # xmm12 = mem[2,3,2,3] 13534; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 13535; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] 13536; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] 13537; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13538; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 13539; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] 13540; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 
16-byte Reload 13541; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1] 13542; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 13543; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload 13544; AVX2-FP-NEXT: # xmm12 = xmm5[2],mem[2],xmm5[3],mem[3] 13545; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 13546; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 13547; AVX2-FP-NEXT: # xmm13 = mem[2,3,2,3] 13548; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 13549; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] 13550; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] 13551; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13552; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload 13553; AVX2-FP-NEXT: # xmm11 = xmm7[2],mem[2],xmm7[3],mem[3] 13554; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm11[1] 13555; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 13556; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload 13557; AVX2-FP-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3] 13558; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 13559; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 13560; AVX2-FP-NEXT: # xmm10 = mem[2,3,2,3] 13561; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 13562; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] 13563; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 13564; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13565; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 13566; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload 13567; AVX2-FP-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] 13568; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm9[1] 13569; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 13570; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload 13571; AVX2-FP-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] 13572; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 13573; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 13574; AVX2-FP-NEXT: # xmm8 = mem[2,3,2,3] 13575; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 13576; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] 13577; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 13578; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13579; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload 13580; AVX2-FP-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] 13581; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 13582; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] 13583; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 13584; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload 13585; AVX2-FP-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] 13586; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 13587; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 13588; AVX2-FP-NEXT: # xmm6 = mem[2,3,2,3] 13589; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 13590; AVX2-FP-NEXT: vblendps 
{{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] 13591; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] 13592; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13593; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] 13594; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 13595; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] 13596; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 13597; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload 13598; AVX2-FP-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3] 13599; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 13600; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 13601; AVX2-FP-NEXT: # xmm4 = mem[2,3,2,3] 13602; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 13603; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 13604; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] 13605; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13606; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] 13607; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 13608; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 13609; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 13610; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload 13611; AVX2-FP-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] 13612; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 13613; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 13614; AVX2-FP-NEXT: # xmm2 = mem[2,3,2,3] 13615; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13616; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 13617; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 13618; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13619; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 13620; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 13621; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13622; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 13623; AVX2-FP-NEXT: # xmm0 = mem[2,3,2,3] 13624; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 13625; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 13626; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 13627; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 13628; AVX2-FP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] 13629; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] 13630; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 13631; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13632; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0 13633; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13634; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1 13635; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13636; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 13637; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 13638; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1 13639; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13640; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2 13641; 
AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13642; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 13643; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13644; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 13645; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13646; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2 13647; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13648; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3 13649; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13650; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm15 13651; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1 13652; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13653; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] 13654; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13655; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 13656; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13657; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] 13658; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13659; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13660; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm0 13661; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13662; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm1 13663; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13664; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 13665; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 13666; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm1 13667; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13668; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm2 13669; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13670; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 13671; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13672; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 13673; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13674; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm2 13675; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13676; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm3 13677; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13678; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm4 13679; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13680; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm1 13681; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13682; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] 13683; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 13684; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] 13685; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13686; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13687; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm0 13688; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13689; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm1 13690; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill 13691; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 13692; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 13693; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm1 13694; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13695; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm2 13696; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13697; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 13698; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13699; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 13700; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13701; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm2 13702; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13703; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm3 13704; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13705; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm4 13706; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm1 13707; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13708; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] 13709; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13710; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 13711; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] 13712; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13713; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13714; AVX2-FP-NEXT: vmovaps 800(%rdi), %ymm0 13715; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13716; AVX2-FP-NEXT: vmovaps 768(%rdi), %ymm1 13717; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13718; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 13719; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 13720; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm1 13721; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13722; AVX2-FP-NEXT: vmovaps 832(%rdi), %ymm2 13723; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13724; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 13725; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13726; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 13727; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13728; AVX2-FP-NEXT: vmovaps 992(%rdi), %ymm2 13729; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13730; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm5 13731; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13732; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm3 13733; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm1 13734; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13735; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] 13736; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13737; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] 13738; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] 13739; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13740; AVX2-FP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13741; AVX2-FP-NEXT: vmovaps 1056(%rdi), %ymm0 13742; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13743; AVX2-FP-NEXT: vmovaps 1024(%rdi), %ymm1 13744; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13745; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 13746; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 13747; AVX2-FP-NEXT: vmovaps 1120(%rdi), %ymm1 13748; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13749; AVX2-FP-NEXT: vmovaps 1088(%rdi), %ymm2 13750; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13751; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 13752; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13753; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 13754; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 13755; AVX2-FP-NEXT: vmovaps 1248(%rdi), %ymm1 13756; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13757; AVX2-FP-NEXT: vmovaps 1216(%rdi), %ymm7 13758; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13759; AVX2-FP-NEXT: vmovaps 1184(%rdi), %ymm2 13760; AVX2-FP-NEXT: vmovaps 1152(%rdi), %ymm6 13761; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13762; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] 13763; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13764; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5] 13765; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] 13766; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] 13767; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13768; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm0 13769; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13770; AVX2-FP-NEXT: vmovaps 1280(%rdi), %ymm6 13771; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13772; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] 13773; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm6 13774; AVX2-FP-NEXT: vmovaps 1376(%rdi), %ymm0 13775; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13776; AVX2-FP-NEXT: vmovaps 1344(%rdi), %ymm7 13777; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13778; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5] 13779; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13780; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] 13781; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] 13782; AVX2-FP-NEXT: vmovaps 1504(%rdi), %ymm6 13783; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13784; AVX2-FP-NEXT: vmovaps 1472(%rdi), %ymm10 13785; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13786; AVX2-FP-NEXT: vmovaps 1440(%rdi), %ymm0 13787; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 13788; AVX2-FP-NEXT: vmovaps 1408(%rdi), %ymm9 13789; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13790; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] 13791; AVX2-FP-NEXT: vunpcklps 
{{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] 13792; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] 13793; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] 13794; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13795; AVX2-FP-NEXT: vmovaps 1568(%rdi), %ymm6 13796; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13797; AVX2-FP-NEXT: vmovaps 1536(%rdi), %ymm7 13798; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13799; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 13800; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm9 13801; AVX2-FP-NEXT: vmovaps 1632(%rdi), %ymm6 13802; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13803; AVX2-FP-NEXT: vmovaps 1600(%rdi), %ymm7 13804; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13805; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 13806; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13807; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2] 13808; AVX2-FP-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3] 13809; AVX2-FP-NEXT: vmovaps 1760(%rdi), %ymm9 13810; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13811; AVX2-FP-NEXT: vmovaps 1728(%rdi), %ymm6 13812; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13813; AVX2-FP-NEXT: vmovaps 1696(%rdi), %ymm7 13814; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13815; AVX2-FP-NEXT: vmovaps 1664(%rdi), %ymm11 13816; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13817; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] 13818; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] 13819; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13820; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] 13821; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 13822; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13823; AVX2-FP-NEXT: vmovaps 1824(%rdi), %ymm6 13824; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13825; AVX2-FP-NEXT: vmovaps 1792(%rdi), %ymm7 13826; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13827; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 13828; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm11 13829; AVX2-FP-NEXT: vmovaps 1888(%rdi), %ymm6 13830; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13831; AVX2-FP-NEXT: vmovaps 1856(%rdi), %ymm7 13832; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13833; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 13834; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13835; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2] 13836; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3] 13837; AVX2-FP-NEXT: vmovaps 2016(%rdi), %ymm11 13838; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13839; AVX2-FP-NEXT: vmovaps 1984(%rdi), %ymm6 13840; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
13841; AVX2-FP-NEXT: vmovaps 1952(%rdi), %ymm7 13842; AVX2-FP-NEXT: vmovaps 1920(%rdi), %ymm9 13843; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13844; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] 13845; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13846; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5] 13847; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] 13848; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] 13849; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13850; AVX2-FP-NEXT: vbroadcastss 148(%rdi), %ymm13 13851; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] 13852; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload 13853; AVX2-FP-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] 13854; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 13855; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm14 13856; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 13857; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] 13858; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 13859; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7] 13860; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15 13861; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] 13862; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] 13863; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13864; AVX2-FP-NEXT: vbroadcastss 404(%rdi), %ymm13 13865; AVX2-FP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload 13866; AVX2-FP-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7] 13867; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] 13868; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 13869; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm13 13870; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 13871; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5] 13872; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 13873; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] 13874; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14 13875; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] 13876; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] 13877; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13878; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm12 13879; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7] 13880; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7] 13881; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 13882; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8 13883; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 13884; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5] 13885; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 13886; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] 13887; AVX2-FP-NEXT: vextractf128 $1, %ymm12, %xmm12 13888; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] 
13889; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 13890; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13891; AVX2-FP-NEXT: vbroadcastss 916(%rdi), %ymm4 13892; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] 13893; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7] 13894; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 13895; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 13896; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 13897; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] 13898; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 13899; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] 13900; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 13901; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] 13902; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 13903; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13904; AVX2-FP-NEXT: vbroadcastss 1172(%rdi), %ymm3 13905; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] 13906; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] 13907; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 13908; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 13909; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 13910; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5] 13911; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13912; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] 13913; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3 13914; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] 13915; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 13916; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13917; AVX2-FP-NEXT: vbroadcastss 1428(%rdi), %ymm1 13918; AVX2-FP-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload 13919; AVX2-FP-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7] 13920; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] 13921; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 13922; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 13923; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 13924; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] 13925; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 13926; AVX2-FP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] 13927; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 13928; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] 13929; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 13930; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13931; AVX2-FP-NEXT: vbroadcastss 1684(%rdi), %ymm0 13932; AVX2-FP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 13933; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7] 13934; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 13935; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 13936; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13937; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1 13938; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded 
Reload 13939; AVX2-FP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] 13940; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 13941; AVX2-FP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] 13942; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 13943; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 13944; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 13945; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13946; AVX2-FP-NEXT: vbroadcastss 1940(%rdi), %ymm0 13947; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] 13948; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] 13949; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13950; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1 13951; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 13952; AVX2-FP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] 13953; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 13954; AVX2-FP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] 13955; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 13956; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 13957; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 13958; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13959; AVX2-FP-NEXT: vbroadcastss 248(%rdi), %ymm0 13960; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 13961; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 13962; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13963; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload 13964; AVX2-FP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 13965; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13966; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] 13967; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13968; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13969; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 13970; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 13971; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13972; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm1 13973; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] 13974; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 13975; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] 13976; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 13977; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13978; AVX2-FP-NEXT: vbroadcastss 504(%rdi), %ymm0 13979; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 13980; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 13981; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13982; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 13983; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 13984; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13985; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = 
ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7] 13986; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13987; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13988; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload 13989; AVX2-FP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 13990; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13991; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm1 13992; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] 13993; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 13994; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] 13995; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 13996; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13997; AVX2-FP-NEXT: vbroadcastss 760(%rdi), %ymm0 13998; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 13999; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 14000; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14001; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 14002; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 14003; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14004; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] 14005; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14006; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14007; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload 14008; AVX2-FP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 14009; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14010; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm1 14011; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] 14012; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 14013; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] 14014; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14015; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14016; AVX2-FP-NEXT: vbroadcastss 1016(%rdi), %ymm0 14017; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 14018; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 14019; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14020; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 14021; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 14022; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14023; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] 14024; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14025; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14026; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload 14027; AVX2-FP-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 14028; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm1 14029; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2] 14030; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 
14031; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] 14032; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14033; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14034; AVX2-FP-NEXT: vbroadcastss 1272(%rdi), %ymm0 14035; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 14036; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 14037; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14038; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload 14039; AVX2-FP-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 14040; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] 14041; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14042; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload 14043; AVX2-FP-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 14044; AVX2-FP-NEXT: vextractf128 $1, %ymm13, %xmm1 14045; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2] 14046; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 14047; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] 14048; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14049; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14050; AVX2-FP-NEXT: vbroadcastss 1528(%rdi), %ymm0 14051; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 14052; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 14053; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14054; AVX2-FP-NEXT: vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload 14055; AVX2-FP-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 14056; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload 14057; AVX2-FP-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] 14058; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14059; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload 14060; AVX2-FP-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 14061; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm1 14062; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2] 14063; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 14064; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] 14065; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14066; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14067; AVX2-FP-NEXT: vbroadcastss 1784(%rdi), %ymm0 14068; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 14069; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 14070; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14071; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 14072; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 14073; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14074; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload 14075; AVX2-FP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 14076; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14077; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 14078; AVX2-FP-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 14079; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm1 14080; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[2,2,2,2] 14081; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 14082; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] 14083; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14084; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14085; AVX2-FP-NEXT: vbroadcastss 2040(%rdi), %ymm0 14086; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 14087; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] 14088; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14089; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 14090; AVX2-FP-NEXT: # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 14091; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14092; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 14093; AVX2-FP-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 14094; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14095; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 14096; AVX2-FP-NEXT: # ymm2 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 14097; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm0 14098; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,2,2,2] 14099; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] 14100; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] 14101; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14102; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 14103; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm0 14104; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 14105; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 14106; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14107; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 14108; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14109; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1 14110; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload 14111; AVX2-FP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] 14112; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15 14113; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] 14114; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14115; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14116; AVX2-FP-NEXT: vbroadcastss 476(%rdi), %ymm0 14117; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 14118; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 14119; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14120; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 14121; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14122; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1 14123; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte 
Folded Reload 14124; AVX2-FP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] 14125; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15 14126; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] 14127; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14128; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14129; AVX2-FP-NEXT: vbroadcastss 732(%rdi), %ymm0 14130; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 14131; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 14132; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14133; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 14134; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14135; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1 14136; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload 14137; AVX2-FP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] 14138; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15 14139; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] 14140; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14141; AVX2-FP-NEXT: vbroadcastss 988(%rdi), %ymm0 14142; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 14143; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 14144; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 14145; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 14146; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm1 14147; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 14148; AVX2-FP-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] 14149; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14 14150; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] 14151; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14152; AVX2-FP-NEXT: vbroadcastss 1244(%rdi), %ymm0 14153; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 14154; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 14155; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] 14156; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm1 14157; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7] 14158; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm11 14159; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] 14160; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14161; AVX2-FP-NEXT: vbroadcastss 1500(%rdi), %ymm0 14162; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 14163; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 14164; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] 14165; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8 14166; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] 14167; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 14168; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] 14169; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] 14170; AVX2-FP-NEXT: vbroadcastss 1756(%rdi), %ymm0 14171; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 14172; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 14173; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = 
ymm6[1],ymm0[1],ymm6[3],ymm0[3] 14174; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 14175; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7] 14176; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 14177; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 14178; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] 14179; AVX2-FP-NEXT: vbroadcastss 2012(%rdi), %ymm0 14180; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 14181; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 14182; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] 14183; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 14184; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] 14185; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3 14186; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] 14187; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 14188; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14189; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rsi) 14190; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14191; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rsi) 14192; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14193; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rsi) 14194; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14195; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi) 14196; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14197; AVX2-FP-NEXT: vmovaps %ymm2, 224(%rsi) 14198; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14199; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rsi) 14200; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14201; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rsi) 14202; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14203; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi) 14204; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14205; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rdx) 14206; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14207; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rdx) 14208; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14209; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rdx) 14210; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14211; AVX2-FP-NEXT: vmovaps %ymm2, (%rdx) 14212; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14213; AVX2-FP-NEXT: vmovaps %ymm2, 224(%rdx) 14214; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14215; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rdx) 14216; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14217; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rdx) 14218; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14219; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx) 14220; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14221; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rcx) 14222; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14223; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rcx) 14224; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14225; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rcx) 14226; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14227; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx) 14228; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14229; AVX2-FP-NEXT: vmovaps %ymm2, 224(%rcx) 
14230; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14231; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rcx) 14232; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14233; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rcx) 14234; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14235; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rcx) 14236; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14237; AVX2-FP-NEXT: vmovaps %ymm2, 192(%r8) 14238; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14239; AVX2-FP-NEXT: vmovaps %ymm2, 128(%r8) 14240; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14241; AVX2-FP-NEXT: vmovaps %ymm2, 64(%r8) 14242; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14243; AVX2-FP-NEXT: vmovaps %ymm2, (%r8) 14244; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14245; AVX2-FP-NEXT: vmovaps %ymm2, 224(%r8) 14246; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14247; AVX2-FP-NEXT: vmovaps %ymm2, 160(%r8) 14248; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14249; AVX2-FP-NEXT: vmovaps %ymm2, 96(%r8) 14250; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14251; AVX2-FP-NEXT: vmovaps %ymm2, 32(%r8) 14252; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14253; AVX2-FP-NEXT: vmovaps %ymm2, 224(%r9) 14254; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14255; AVX2-FP-NEXT: vmovaps %ymm2, 192(%r9) 14256; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14257; AVX2-FP-NEXT: vmovaps %ymm2, 160(%r9) 14258; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14259; AVX2-FP-NEXT: vmovaps %ymm2, 128(%r9) 14260; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14261; AVX2-FP-NEXT: vmovaps %ymm2, 96(%r9) 14262; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14263; AVX2-FP-NEXT: vmovaps %ymm2, 64(%r9) 14264; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14265; AVX2-FP-NEXT: vmovaps %ymm2, 32(%r9) 14266; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14267; AVX2-FP-NEXT: vmovaps %ymm2, (%r9) 14268; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 14269; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14270; AVX2-FP-NEXT: vmovaps %ymm2, 224(%rax) 14271; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14272; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rax) 14273; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14274; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rax) 14275; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14276; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rax) 14277; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14278; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rax) 14279; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14280; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rax) 14281; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14282; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rax) 14283; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14284; AVX2-FP-NEXT: vmovaps %ymm2, (%rax) 14285; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 14286; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload 14287; AVX2-FP-NEXT: vmovaps %ymm2, 224(%rax) 14288; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14289; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rax) 14290; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14291; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rax) 14292; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14293; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rax) 14294; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14295; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rax) 14296; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14297; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rax) 14298; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14299; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rax) 14300; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 14301; AVX2-FP-NEXT: vmovaps %ymm2, (%rax) 14302; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 14303; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) 14304; AVX2-FP-NEXT: vmovaps %ymm5, 192(%rax) 14305; AVX2-FP-NEXT: vmovaps %ymm8, 160(%rax) 14306; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax) 14307; AVX2-FP-NEXT: vmovaps %ymm14, 96(%rax) 14308; AVX2-FP-NEXT: vmovaps %ymm15, 64(%rax) 14309; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14310; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) 14311; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 14312; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) 14313; AVX2-FP-NEXT: addq $3528, %rsp # imm = 0xDC8 14314; AVX2-FP-NEXT: vzeroupper 14315; AVX2-FP-NEXT: retq 14316; 14317; AVX2-FCP-LABEL: load_i32_stride8_vf64: 14318; AVX2-FCP: # %bb.0: 14319; AVX2-FCP-NEXT: subq $3528, %rsp # imm = 0xDC8 14320; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm10 14321; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm0 14322; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14323; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] 14324; AVX2-FCP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14325; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm9 14326; AVX2-FCP-NEXT: vbroadcastss %xmm9, %xmm1 14327; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm2 14328; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill 14329; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2 14330; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 14331; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 14332; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm1 14333; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14334; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm2 14335; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14336; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 14337; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14338; AVX2-FCP-NEXT: vmovaps 480(%rdi), %xmm13 14339; AVX2-FCP-NEXT: vbroadcastss %xmm13, %xmm2 14340; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm3 14341; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14342; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm3 14343; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 14344; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14345; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] 14346; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14347; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14348; AVX2-FCP-NEXT: vmovaps 800(%rdi), %xmm0 14349; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14350; AVX2-FCP-NEXT: vmovaps 
768(%rdi), %xmm1 14351; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14352; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 14353; AVX2-FCP-NEXT: vmovaps 864(%rdi), %xmm12 14354; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm1 14355; AVX2-FCP-NEXT: vmovaps 832(%rdi), %xmm2 14356; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14357; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2 14358; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 14359; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 14360; AVX2-FCP-NEXT: vmovaps 992(%rdi), %xmm1 14361; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14362; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 14363; AVX2-FCP-NEXT: vmovaps 960(%rdi), %xmm2 14364; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14365; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2 14366; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 14367; AVX2-FCP-NEXT: vmovaps 928(%rdi), %xmm2 14368; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14369; AVX2-FCP-NEXT: vmovaps 896(%rdi), %xmm3 14370; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14371; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 14372; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14373; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14374; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14375; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14376; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14377; AVX2-FCP-NEXT: vmovaps 1376(%rdi), %xmm0 14378; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14379; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0 14380; AVX2-FCP-NEXT: vmovaps 1344(%rdi), %xmm1 14381; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14382; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 14383; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 14384; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %xmm1 14385; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14386; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %xmm2 14387; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14388; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 14389; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 14390; AVX2-FCP-NEXT: vmovaps 1504(%rdi), %xmm1 14391; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14392; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 14393; AVX2-FCP-NEXT: vmovaps 1472(%rdi), %xmm2 14394; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14395; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2 14396; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 14397; AVX2-FCP-NEXT: vmovaps 1440(%rdi), %xmm2 14398; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14399; AVX2-FCP-NEXT: vmovaps 1408(%rdi), %xmm3 14400; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14401; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 14402; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14403; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14404; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14405; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14406; 
AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14407; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %xmm0 14408; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14409; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0 14410; AVX2-FCP-NEXT: vmovaps 1856(%rdi), %xmm1 14411; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14412; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 14413; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 14414; AVX2-FCP-NEXT: vmovaps 1824(%rdi), %xmm1 14415; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14416; AVX2-FCP-NEXT: vmovaps 1792(%rdi), %xmm2 14417; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14418; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 14419; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] 14420; AVX2-FCP-NEXT: vmovaps 2016(%rdi), %xmm0 14421; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14422; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm1 14423; AVX2-FCP-NEXT: vmovaps 1984(%rdi), %xmm0 14424; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14425; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm2 14426; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 14427; AVX2-FCP-NEXT: vmovaps 1952(%rdi), %xmm0 14428; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14429; AVX2-FCP-NEXT: vmovaps 1920(%rdi), %xmm2 14430; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14431; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 14432; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14433; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14434; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14435; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] 14436; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14437; AVX2-FCP-NEXT: vmovaps 608(%rdi), %xmm0 14438; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14439; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0 14440; AVX2-FCP-NEXT: vmovaps 576(%rdi), %xmm1 14441; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14442; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 14443; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 14444; AVX2-FCP-NEXT: vmovaps 544(%rdi), %xmm2 14445; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14446; AVX2-FCP-NEXT: vmovaps 512(%rdi), %xmm1 14447; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14448; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 14449; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 14450; AVX2-FCP-NEXT: vmovaps 736(%rdi), %xmm1 14451; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14452; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 14453; AVX2-FCP-NEXT: vmovaps 704(%rdi), %xmm2 14454; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14455; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2 14456; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 14457; AVX2-FCP-NEXT: vmovaps 672(%rdi), %xmm3 14458; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14459; AVX2-FCP-NEXT: vmovaps 640(%rdi), %xmm2 14460; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14461; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1] 14462; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14463; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14464; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14465; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14466; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14467; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %xmm0 14468; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14469; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0 14470; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %xmm1 14471; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14472; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 14473; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 14474; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %xmm2 14475; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14476; AVX2-FCP-NEXT: vmovaps 1024(%rdi), %xmm1 14477; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14478; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 14479; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 14480; AVX2-FCP-NEXT: vmovaps 1248(%rdi), %xmm1 14481; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14482; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 14483; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %xmm2 14484; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14485; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2 14486; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 14487; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %xmm3 14488; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14489; AVX2-FCP-NEXT: vmovaps 1152(%rdi), %xmm2 14490; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14491; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 14492; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14493; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14494; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14495; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14496; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14497; AVX2-FCP-NEXT: vmovaps 1632(%rdi), %xmm0 14498; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14499; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0 14500; AVX2-FCP-NEXT: vmovaps 1600(%rdi), %xmm1 14501; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14502; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 14503; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 14504; AVX2-FCP-NEXT: vmovaps 1568(%rdi), %xmm2 14505; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14506; AVX2-FCP-NEXT: vmovaps 1536(%rdi), %xmm1 14507; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14508; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 14509; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 14510; AVX2-FCP-NEXT: vmovaps 1760(%rdi), %xmm1 14511; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14512; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 14513; AVX2-FCP-NEXT: vmovaps 1728(%rdi), %xmm2 14514; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14515; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2 14516; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] 14517; AVX2-FCP-NEXT: vmovaps 1696(%rdi), %xmm3 14518; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14519; AVX2-FCP-NEXT: vmovaps 1664(%rdi), %xmm2 14520; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14521; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 14522; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14523; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14524; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14525; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14526; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14527; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm0 14528; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14529; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0 14530; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm11 14531; AVX2-FCP-NEXT: vbroadcastss %xmm11, %xmm1 14532; AVX2-FCP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14533; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 14534; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm2 14535; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14536; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm1 14537; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14538; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 14539; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14540; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 14541; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7] 14542; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm8 14543; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm1 14544; AVX2-FCP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14545; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm7 14546; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm2 14547; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14548; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 14549; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm5 14550; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm6 14551; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 14552; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14553; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14554; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] 14555; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] 14556; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14557; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 14558; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1] 14559; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] 14560; AVX2-FCP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14561; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 14562; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] 14563; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 14564; AVX2-FCP-NEXT: vmovaps %xmm13, %xmm9 14565; AVX2-FCP-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14566; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 14567; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] 14568; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14569; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte 
Folded Reload 14570; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1] 14571; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 14572; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 14573; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14574; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14575; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14576; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14577; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 14578; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1] 14579; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 14580; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 14581; AVX2-FCP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14582; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 14583; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] 14584; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 14585; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 14586; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 14587; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] 14588; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14589; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 14590; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1] 14591; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 14592; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 14593; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14594; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14595; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14596; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14597; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 14598; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1] 14599; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 14600; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 14601; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 14602; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 14603; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 14604; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 14605; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 14606; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 14607; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 14608; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14609; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 14610; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1] 14611; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 14612; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 14613; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14614; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14615; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14616; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14617; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 14618; 
AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1] 14619; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 14620; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 14621; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 14622; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 14623; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 14624; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 14625; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 14626; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 14627; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 14628; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14629; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 14630; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1] 14631; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 14632; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 14633; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14634; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14635; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14636; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14637; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] 14638; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] 14639; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 14640; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 14641; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload 14642; AVX2-FCP-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] 14643; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14644; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 14645; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1] 14646; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 14647; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 14648; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14649; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14650; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14651; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14652; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 14653; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1] 14654; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 14655; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 14656; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 14657; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 14658; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 14659; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 14660; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 14661; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 14662; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 14663; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14664; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 14665; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1] 14666; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded 
Reload 14667; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 14668; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14669; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14670; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14671; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14672; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 14673; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1] 14674; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 14675; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 14676; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 14677; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 14678; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 14679; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 14680; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 14681; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 14682; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] 14683; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14684; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 14685; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1] 14686; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 14687; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 14688; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14689; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14690; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14691; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14692; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 14693; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1] 14694; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 14695; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 14696; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 14697; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 14698; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 14699; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 14700; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 14701; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 14702; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 14703; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14704; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 14705; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1] 14706; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 14707; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 14708; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14709; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 14710; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 14711; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14712; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14713; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload 14714; AVX2-FCP-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] 14715; AVX2-FCP-NEXT: vmovaps %xmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14716; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14717; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 14718; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 14719; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14720; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2] 14721; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] 14722; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 14723; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14724; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 14725; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 14726; AVX2-FCP-NEXT: # xmm1 = mem[2,2,2,2] 14727; AVX2-FCP-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload 14728; AVX2-FCP-NEXT: # xmm1 = mem[0,1,2],xmm1[3] 14729; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] 14730; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14731; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14732; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14733; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload 14734; AVX2-FCP-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] 14735; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14736; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14737; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 14738; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 14739; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14740; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2] 14741; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] 14742; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 14743; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14744; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 14745; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 14746; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] 14747; AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 14748; AVX2-FCP-NEXT: # xmm1 = mem[0,1,2],xmm1[3] 14749; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 14750; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14751; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14752; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14753; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload 14754; AVX2-FCP-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3] 14755; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14756; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 14757; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 14758; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14759; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] 14760; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] 14761; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 14762; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14763; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 14764; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 
# 16-byte Folded Reload 14765; AVX2-FCP-NEXT: # xmm1 = mem[2,2,2,2] 14766; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 14767; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] 14768; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] 14769; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14770; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14771; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14772; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload 14773; AVX2-FCP-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] 14774; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14775; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 14776; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 14777; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14778; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 14779; AVX2-FCP-NEXT: # xmm0 = mem[2,2,2,2] 14780; AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 14781; AVX2-FCP-NEXT: # xmm0 = mem[0,1,2],xmm0[3] 14782; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 14783; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14784; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 14785; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 14786; AVX2-FCP-NEXT: # xmm1 = mem[2,2,2,2] 14787; AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 14788; AVX2-FCP-NEXT: # xmm1 = mem[0,1,2],xmm1[3] 14789; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] 14790; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14791; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14792; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14793; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload 14794; AVX2-FCP-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] 14795; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14796; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14797; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 14798; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 14799; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14800; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,2,2,2] 14801; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] 14802; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 14803; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14804; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 14805; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 14806; AVX2-FCP-NEXT: # xmm1 = mem[2,2,2,2] 14807; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 14808; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] 14809; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 14810; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14811; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14812; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14813; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0, %xmm2 # 16-byte Folded Reload 14814; AVX2-FCP-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] 14815; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14816; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14817; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 14818; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 14819; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14820; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] 14821; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] 14822; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 14823; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14824; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 14825; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 14826; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] 14827; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 14828; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] 14829; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 14830; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14831; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14832; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14833; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload 14834; AVX2-FCP-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] 14835; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14836; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14837; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 14838; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 14839; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14840; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 14841; AVX2-FCP-NEXT: # xmm0 = mem[2,2,2,2] 14842; AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 14843; AVX2-FCP-NEXT: # xmm0 = mem[0,1,2],xmm0[3] 14844; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 14845; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 14846; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] 14847; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 14848; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2] 14849; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 14850; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] 14851; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3] 14852; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] 14853; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14854; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14855; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload 14856; AVX2-FCP-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] 14857; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 14858; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 14859; AVX2-FCP-NEXT: # xmm15 = mem[2,2,2,2] 14860; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 14861; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3] 14862; AVX2-FCP-NEXT: 
vinsertf128 $1, %xmm15, %ymm0, %ymm15 14863; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm14 14864; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 14865; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 14866; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload 14867; AVX2-FCP-NEXT: # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3] 14868; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 14869; AVX2-FCP-NEXT: # xmm13 = mem[2,2,2,2] 14870; AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload 14871; AVX2-FCP-NEXT: # xmm13 = mem[0,1,2],xmm13[3] 14872; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] 14873; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7] 14874; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14875; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload 14876; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload 14877; AVX2-FCP-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3] 14878; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 14879; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm5[1],xmm13[1] 14880; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 14881; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload 14882; AVX2-FCP-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3] 14883; AVX2-FCP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 14884; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 14885; AVX2-FCP-NEXT: # xmm12 = mem[2,3,2,3] 14886; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 14887; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] 14888; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] 14889; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14890; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 14891; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] 14892; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 14893; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1] 14894; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 14895; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload 14896; AVX2-FCP-NEXT: # xmm12 = xmm5[2],mem[2],xmm5[3],mem[3] 14897; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 14898; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 14899; AVX2-FCP-NEXT: # xmm13 = mem[2,3,2,3] 14900; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 14901; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] 14902; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] 14903; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14904; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload 14905; AVX2-FCP-NEXT: # xmm11 = xmm7[2],mem[2],xmm7[3],mem[3] 14906; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm11[1] 14907; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 14908; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload 14909; AVX2-FCP-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3] 14910; AVX2-FCP-NEXT: 
vinsertf128 $1, %xmm11, %ymm0, %ymm11 14911; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 14912; AVX2-FCP-NEXT: # xmm10 = mem[2,3,2,3] 14913; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 14914; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] 14915; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 14916; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14917; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 14918; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload 14919; AVX2-FCP-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] 14920; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm9[1] 14921; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 14922; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload 14923; AVX2-FCP-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] 14924; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 14925; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 14926; AVX2-FCP-NEXT: # xmm8 = mem[2,3,2,3] 14927; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 14928; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] 14929; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 14930; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14931; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload 14932; AVX2-FCP-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] 14933; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 14934; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] 14935; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 14936; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload 14937; AVX2-FCP-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] 14938; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 14939; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 14940; AVX2-FCP-NEXT: # xmm6 = mem[2,3,2,3] 14941; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 14942; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] 14943; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] 14944; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14945; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] 14946; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 14947; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] 14948; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 14949; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload 14950; AVX2-FCP-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3] 14951; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 14952; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 14953; AVX2-FCP-NEXT: # xmm4 = mem[2,3,2,3] 14954; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 14955; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 14956; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] 14957; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14958; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] 14959; AVX2-FCP-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 14960; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 14961; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 14962; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload 14963; AVX2-FCP-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] 14964; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 14965; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 14966; AVX2-FCP-NEXT: # xmm2 = mem[2,3,2,3] 14967; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14968; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 14969; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 14970; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14971; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 14972; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 14973; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14974; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 14975; AVX2-FCP-NEXT: # xmm0 = mem[2,3,2,3] 14976; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 14977; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 14978; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 14979; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 14980; AVX2-FCP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] 14981; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] 14982; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14983; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14984; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0 14985; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14986; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1 14987; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14988; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 14989; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 14990; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1 14991; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14992; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2 14993; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14994; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 14995; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 14996; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 14997; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 14998; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2 14999; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15000; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm3 15001; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15002; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm15 15003; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1 15004; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15005; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] 15006; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15007; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 15008; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
15009; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] 15010; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 15011; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15012; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm0 15013; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15014; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm1 15015; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15016; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 15017; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 15018; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm1 15019; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15020; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm2 15021; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15022; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 15023; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15024; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 15025; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 15026; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm2 15027; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15028; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm3 15029; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15030; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm4 15031; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15032; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm1 15033; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15034; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] 15035; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 15036; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] 15037; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 15038; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15039; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm0 15040; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15041; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm1 15042; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15043; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 15044; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 15045; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm1 15046; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15047; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm2 15048; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15049; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 15050; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15051; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 15052; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 15053; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm2 15054; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15055; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm3 15056; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15057; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm4 15058; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm1 15059; AVX2-FCP-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15060; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] 15061; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15062; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 15063; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] 15064; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 15065; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15066; AVX2-FCP-NEXT: vmovaps 800(%rdi), %ymm0 15067; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15068; AVX2-FCP-NEXT: vmovaps 768(%rdi), %ymm1 15069; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15070; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 15071; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 15072; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm1 15073; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15074; AVX2-FCP-NEXT: vmovaps 832(%rdi), %ymm2 15075; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15076; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 15077; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15078; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 15079; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 15080; AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm2 15081; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15082; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm5 15083; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15084; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm3 15085; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm1 15086; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15087; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] 15088; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15089; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] 15090; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] 15091; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 15092; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15093; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %ymm0 15094; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15095; AVX2-FCP-NEXT: vmovaps 1024(%rdi), %ymm1 15096; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15097; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 15098; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 15099; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %ymm1 15100; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15101; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %ymm2 15102; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15103; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] 15104; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15105; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 15106; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 15107; AVX2-FCP-NEXT: vmovaps 1248(%rdi), %ymm1 15108; 
AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15109; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %ymm7 15110; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15111; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %ymm2 15112; AVX2-FCP-NEXT: vmovaps 1152(%rdi), %ymm6 15113; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15114; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] 15115; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15116; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5] 15117; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] 15118; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] 15119; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15120; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %ymm0 15121; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15122; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %ymm6 15123; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15124; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] 15125; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm6 15126; AVX2-FCP-NEXT: vmovaps 1376(%rdi), %ymm0 15127; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15128; AVX2-FCP-NEXT: vmovaps 1344(%rdi), %ymm7 15129; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15130; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5] 15131; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15132; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] 15133; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] 15134; AVX2-FCP-NEXT: vmovaps 1504(%rdi), %ymm6 15135; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15136; AVX2-FCP-NEXT: vmovaps 1472(%rdi), %ymm10 15137; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15138; AVX2-FCP-NEXT: vmovaps 1440(%rdi), %ymm0 15139; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 15140; AVX2-FCP-NEXT: vmovaps 1408(%rdi), %ymm9 15141; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15142; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] 15143; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] 15144; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] 15145; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] 15146; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15147; AVX2-FCP-NEXT: vmovaps 1568(%rdi), %ymm6 15148; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15149; AVX2-FCP-NEXT: vmovaps 1536(%rdi), %ymm7 15150; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15151; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 15152; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm9 15153; AVX2-FCP-NEXT: vmovaps 1632(%rdi), %ymm6 15154; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15155; AVX2-FCP-NEXT: vmovaps 1600(%rdi), %ymm7 15156; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15157; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 
= ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 15158; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15159; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2] 15160; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3] 15161; AVX2-FCP-NEXT: vmovaps 1760(%rdi), %ymm9 15162; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15163; AVX2-FCP-NEXT: vmovaps 1728(%rdi), %ymm6 15164; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15165; AVX2-FCP-NEXT: vmovaps 1696(%rdi), %ymm7 15166; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15167; AVX2-FCP-NEXT: vmovaps 1664(%rdi), %ymm11 15168; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15169; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] 15170; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] 15171; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15172; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] 15173; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 15174; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15175; AVX2-FCP-NEXT: vmovaps 1824(%rdi), %ymm6 15176; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15177; AVX2-FCP-NEXT: vmovaps 1792(%rdi), %ymm7 15178; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15179; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 15180; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm11 15181; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %ymm6 15182; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15183; AVX2-FCP-NEXT: vmovaps 1856(%rdi), %ymm7 15184; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15185; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 15186; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15187; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2] 15188; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3] 15189; AVX2-FCP-NEXT: vmovaps 2016(%rdi), %ymm11 15190; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15191; AVX2-FCP-NEXT: vmovaps 1984(%rdi), %ymm6 15192; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15193; AVX2-FCP-NEXT: vmovaps 1952(%rdi), %ymm7 15194; AVX2-FCP-NEXT: vmovaps 1920(%rdi), %ymm9 15195; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15196; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] 15197; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15198; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5] 15199; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] 15200; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] 15201; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15202; AVX2-FCP-NEXT: vbroadcastss 148(%rdi), %ymm13 15203; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] 15204; AVX2-FCP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload 15205; 
AVX2-FCP-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] 15206; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 15207; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm14 15208; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 15209; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] 15210; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 15211; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7] 15212; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15 15213; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] 15214; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] 15215; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15216; AVX2-FCP-NEXT: vbroadcastss 404(%rdi), %ymm13 15217; AVX2-FCP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload 15218; AVX2-FCP-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7] 15219; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] 15220; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 15221; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm13 15222; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 15223; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5] 15224; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 15225; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] 15226; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14 15227; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] 15228; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] 15229; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15230; AVX2-FCP-NEXT: vbroadcastss 660(%rdi), %ymm12 15231; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7] 15232; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7] 15233; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 15234; AVX2-FCP-NEXT: vextractf128 $1, %ymm8, %xmm8 15235; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 15236; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5] 15237; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 15238; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] 15239; AVX2-FCP-NEXT: vextractf128 $1, %ymm12, %xmm12 15240; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] 15241; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 15242; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15243; AVX2-FCP-NEXT: vbroadcastss 916(%rdi), %ymm4 15244; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] 15245; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7] 15246; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 15247; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4 15248; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 15249; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] 15250; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 15251; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] 15252; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 15253; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = 
xmm5[0,1],xmm4[2,3] 15254; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 15255; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15256; AVX2-FCP-NEXT: vbroadcastss 1172(%rdi), %ymm3 15257; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] 15258; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] 15259; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15260; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 15261; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 15262; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5] 15263; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 15264; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] 15265; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3 15266; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] 15267; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 15268; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15269; AVX2-FCP-NEXT: vbroadcastss 1428(%rdi), %ymm1 15270; AVX2-FCP-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload 15271; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7] 15272; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] 15273; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 15274; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 15275; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 15276; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] 15277; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 15278; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] 15279; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 15280; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] 15281; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 15282; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15283; AVX2-FCP-NEXT: vbroadcastss 1684(%rdi), %ymm0 15284; AVX2-FCP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15285; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7] 15286; AVX2-FCP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15287; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] 15288; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15289; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 15290; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 15291; AVX2-FCP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] 15292; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 15293; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] 15294; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 15295; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 15296; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15297; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15298; AVX2-FCP-NEXT: vbroadcastss 1940(%rdi), %ymm0 15299; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] 15300; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] 15301; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15302; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 15303; 
AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 15304; AVX2-FCP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] 15305; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 15306; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] 15307; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 15308; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 15309; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15310; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15311; AVX2-FCP-NEXT: vbroadcastss 248(%rdi), %ymm0 15312; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15313; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 15314; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15315; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload 15316; AVX2-FCP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15317; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15318; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] 15319; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15320; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15321; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 15322; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15323; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15324; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm1 15325; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] 15326; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 15327; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] 15328; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15329; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15330; AVX2-FCP-NEXT: vbroadcastss 504(%rdi), %ymm0 15331; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15332; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 15333; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15334; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 15335; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15336; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15337; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7] 15338; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15339; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15340; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload 15341; AVX2-FCP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15342; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15343; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm1 15344; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] 15345; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 15346; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] 15347; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15348; AVX2-FCP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15349; AVX2-FCP-NEXT: vbroadcastss 760(%rdi), %ymm0 15350; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15351; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 15352; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15353; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 15354; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15355; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15356; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] 15357; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15358; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15359; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload 15360; AVX2-FCP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15361; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15362; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm1 15363; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] 15364; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 15365; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] 15366; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15367; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15368; AVX2-FCP-NEXT: vbroadcastss 1016(%rdi), %ymm0 15369; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15370; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 15371; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15372; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 15373; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15374; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15375; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] 15376; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15377; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15378; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload 15379; AVX2-FCP-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15380; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm1 15381; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2] 15382; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 15383; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] 15384; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15385; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15386; AVX2-FCP-NEXT: vbroadcastss 1272(%rdi), %ymm0 15387; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15388; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 15389; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15390; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload 15391; AVX2-FCP-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15392; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm13 = 
ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] 15393; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15394; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload 15395; AVX2-FCP-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15396; AVX2-FCP-NEXT: vextractf128 $1, %ymm13, %xmm1 15397; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2] 15398; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 15399; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] 15400; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15401; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15402; AVX2-FCP-NEXT: vbroadcastss 1528(%rdi), %ymm0 15403; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15404; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 15405; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15406; AVX2-FCP-NEXT: vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload 15407; AVX2-FCP-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15408; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload 15409; AVX2-FCP-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] 15410; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15411; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload 15412; AVX2-FCP-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15413; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm1 15414; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2] 15415; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 15416; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] 15417; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15418; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15419; AVX2-FCP-NEXT: vbroadcastss 1784(%rdi), %ymm0 15420; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15421; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] 15422; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15423; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 15424; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15425; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15426; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload 15427; AVX2-FCP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15428; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15429; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 15430; AVX2-FCP-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 15431; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm1 15432; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[2,2,2,2] 15433; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] 15434; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] 15435; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15436; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15437; AVX2-FCP-NEXT: vbroadcastss 2040(%rdi), %ymm0 15438; AVX2-FCP-NEXT: 
vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 15439; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] 15440; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 15441; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 15442; AVX2-FCP-NEXT: # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 15443; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 15444; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 15445; AVX2-FCP-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 15446; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 15447; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 15448; AVX2-FCP-NEXT: # ymm2 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 15449; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm0 15450; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,2,2,2] 15451; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] 15452; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] 15453; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 15454; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 15455; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm0 15456; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15457; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 15458; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15459; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 15460; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15461; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 15462; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload 15463; AVX2-FCP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] 15464; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15 15465; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] 15466; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15467; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15468; AVX2-FCP-NEXT: vbroadcastss 476(%rdi), %ymm0 15469; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15470; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 15471; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15472; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 15473; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15474; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 15475; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload 15476; AVX2-FCP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] 15477; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15 15478; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] 15479; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15480; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 15481; AVX2-FCP-NEXT: vbroadcastss 732(%rdi), %ymm0 15482; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15483; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 15484; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15485; 
AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 15486; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15487; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 15488; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload 15489; AVX2-FCP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] 15490; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15 15491; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] 15492; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15493; AVX2-FCP-NEXT: vbroadcastss 988(%rdi), %ymm0 15494; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15495; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 15496; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 15497; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 15498; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm1 15499; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 15500; AVX2-FCP-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] 15501; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14 15502; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] 15503; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15504; AVX2-FCP-NEXT: vbroadcastss 1244(%rdi), %ymm0 15505; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15506; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 15507; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] 15508; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm1 15509; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7] 15510; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 15511; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] 15512; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 15513; AVX2-FCP-NEXT: vbroadcastss 1500(%rdi), %ymm0 15514; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15515; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 15516; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] 15517; AVX2-FCP-NEXT: vextractf128 $1, %ymm8, %xmm8 15518; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] 15519; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm9 15520; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] 15521; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] 15522; AVX2-FCP-NEXT: vbroadcastss 1756(%rdi), %ymm0 15523; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15524; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 15525; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] 15526; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 15527; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7] 15528; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 15529; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 15530; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] 15531; AVX2-FCP-NEXT: vbroadcastss 2012(%rdi), %ymm0 15532; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 15533; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 15534; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = 
ymm4[1],ymm0[1],ymm4[3],ymm0[3] 15535; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 15536; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] 15537; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3 15538; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] 15539; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 15540; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15541; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rsi) 15542; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15543; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rsi) 15544; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15545; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rsi) 15546; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15547; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) 15548; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15549; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%rsi) 15550; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15551; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rsi) 15552; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15553; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rsi) 15554; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15555; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi) 15556; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15557; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rdx) 15558; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15559; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rdx) 15560; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15561; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rdx) 15562; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15563; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx) 15564; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15565; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%rdx) 15566; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15567; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rdx) 15568; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15569; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rdx) 15570; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15571; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx) 15572; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15573; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rcx) 15574; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15575; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rcx) 15576; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15577; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rcx) 15578; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15579; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx) 15580; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15581; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%rcx) 15582; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15583; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rcx) 15584; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15585; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rcx) 15586; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15587; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rcx) 15588; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15589; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%r8) 15590; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload 15591; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%r8) 15592; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15593; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%r8) 15594; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15595; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8) 15596; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15597; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%r8) 15598; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15599; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%r8) 15600; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15601; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%r8) 15602; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15603; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%r8) 15604; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15605; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%r9) 15606; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15607; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%r9) 15608; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15609; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%r9) 15610; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15611; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%r9) 15612; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15613; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%r9) 15614; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15615; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%r9) 15616; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15617; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%r9) 15618; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15619; AVX2-FCP-NEXT: vmovaps %ymm2, (%r9) 15620; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 15621; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15622; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%rax) 15623; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15624; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rax) 15625; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15626; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rax) 15627; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15628; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rax) 15629; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15630; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax) 15631; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15632; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rax) 15633; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15634; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rax) 15635; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15636; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax) 15637; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 15638; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload 15639; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%rax) 15640; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15641; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rax) 15642; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15643; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rax) 15644; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15645; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rax) 15646; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15647; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax) 15648; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15649; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rax) 15650; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15651; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rax) 15652; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 15653; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax) 15654; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 15655; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax) 15656; AVX2-FCP-NEXT: vmovaps %ymm5, 192(%rax) 15657; AVX2-FCP-NEXT: vmovaps %ymm8, 160(%rax) 15658; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rax) 15659; AVX2-FCP-NEXT: vmovaps %ymm14, 96(%rax) 15660; AVX2-FCP-NEXT: vmovaps %ymm15, 64(%rax) 15661; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 15662; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) 15663; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 15664; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) 15665; AVX2-FCP-NEXT: addq $3528, %rsp # imm = 0xDC8 15666; AVX2-FCP-NEXT: vzeroupper 15667; AVX2-FCP-NEXT: retq 15668; 15669; AVX512-LABEL: load_i32_stride8_vf64: 15670; AVX512: # %bb.0: 15671; AVX512-NEXT: subq $3144, %rsp # imm = 0xC48 15672; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm11 15673; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15674; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm18 15675; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm31 15676; AVX512-NEXT: vmovaps 1536(%rdi), %zmm0 15677; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill 15678; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm24 15679; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15680; AVX512-NEXT: vmovaps 1664(%rdi), %zmm0 15681; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15682; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm21 15683; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm26 15684; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm22 15685; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm5 15686; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm13 15687; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm3 15688; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm30 15689; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm2 15690; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm29 15691; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm27 15692; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm20 15693; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm10 15694; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm25 15695; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm7 15696; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm9 15697; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15698; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12 15699; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm6 15700; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15701; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm28 15702; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm23 15703; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm4 15704; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 15705; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 15706; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 15707; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16 15708; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 15709; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 15710; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 15711; AVX512-NEXT: movb $-64, %al 15712; AVX512-NEXT: kmovw %eax, %k1 15713; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15714; AVX512-NEXT: vmovdqa64 %zmm12, %zmm1 15715; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 15716; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 15717; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 
15718; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 15719; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 15720; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 15721; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15722; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15723; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 15724; AVX512-NEXT: vmovdqa64 %zmm10, %zmm6 15725; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15726; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 15727; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 15728; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 15729; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15730; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 15731; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 15732; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 15733; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 15734; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 15735; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 15736; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 15737; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15738; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15739; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 15740; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15741; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 15742; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 15743; AVX512-NEXT: vmovdqa64 %zmm22, %zmm17 15744; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15745; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 15746; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 15747; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15748; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 15749; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 15750; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 15751; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload 15752; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 15753; AVX512-NEXT: vmovdqa64 %zmm31, %zmm24 15754; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 15755; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 15756; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 15757; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15758; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15759; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15760; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 15761; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 15762; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm22 15763; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 15764; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 15765; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15766; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 15767; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15768; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 15769; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15770; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 15771; AVX512-NEXT: vmovdqa64 (%rdi), %zmm7 15772; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19 15773; AVX512-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 15774; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15775; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 15776; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 15777; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15778; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 15779; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 15780; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 
15781; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 15782; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 15783; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15784; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 15785; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15786; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15787; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 15788; AVX512-NEXT: vmovdqa64 %zmm30, %zmm31 15789; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 15790; AVX512-NEXT: vmovdqa64 %zmm10, %zmm12 15791; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15792; AVX512-NEXT: vmovdqa64 %zmm13, %zmm30 15793; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 15794; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 15795; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15796; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15797; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15798; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 15799; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 15800; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 15801; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 15802; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 15803; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15804; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15805; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 15806; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload 15807; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 15808; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15809; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 15810; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 15811; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 15812; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15813; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15814; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 15815; AVX512-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 15816; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 15817; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 15818; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15819; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 15820; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 15821; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 15822; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 15823; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 15824; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 15825; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15826; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15827; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 15828; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 15829; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 15830; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 15831; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 15832; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 15833; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15834; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 15835; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 15836; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 15837; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 15838; AVX512-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 15839; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 15840; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 15841; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill 15842; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 15843; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 15844; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 15845; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 15846; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 15847; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 15848; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 15849; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15850; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 15851; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 15852; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] 15853; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15854; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15855; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 15856; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 15857; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 15858; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 15859; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15860; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 15861; AVX512-NEXT: vmovdqa64 %zmm13, %zmm16 15862; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 15863; AVX512-NEXT: vmovdqa64 %zmm25, %zmm29 15864; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] 15865; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15866; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15867; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 15868; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 15869; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 15870; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 15871; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15872; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 15873; AVX512-NEXT: vmovdqa64 %zmm21, %zmm7 15874; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 15875; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15876; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 15877; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 15878; AVX512-NEXT: vmovdqa64 %zmm3, %zmm25 15879; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 15880; AVX512-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload 15881; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 15882; AVX512-NEXT: vmovdqa64 %zmm24, %zmm15 15883; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 15884; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 15885; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15886; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15887; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 15888; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 15889; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 15890; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 15891; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 15892; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15893; AVX512-NEXT: vmovdqa64 %zmm14, %zmm6 15894; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 15895; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 15896; AVX512-NEXT: vmovdqa64 %zmm11, %zmm24 15897; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 15898; AVX512-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 15899; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 15900; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 15901; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15902; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 15903; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 
15904; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 15905; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 15906; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 15907; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 15908; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 15909; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15910; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 15911; AVX512-NEXT: vmovdqa64 %zmm31, %zmm21 15912; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 15913; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 15914; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 15915; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 15916; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15917; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15918; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 15919; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 15920; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15921; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 15922; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 15923; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 15924; AVX512-NEXT: vmovdqa64 %zmm5, %zmm31 15925; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15926; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 15927; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 15928; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 15929; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 15930; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 15931; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15932; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15933; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 15934; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 15935; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 15936; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 15937; AVX512-NEXT: vmovdqa64 %zmm7, %zmm26 15938; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15939; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 15940; AVX512-NEXT: vmovdqa64 %zmm25, %zmm9 15941; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 15942; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 15943; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 15944; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 15945; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15946; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15947; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 15948; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 15949; AVX512-NEXT: vmovdqa64 %zmm18, %zmm25 15950; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 15951; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 15952; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 15953; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15954; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 15955; AVX512-NEXT: vmovdqa64 %zmm24, %zmm18 15956; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 15957; AVX512-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 15958; AVX512-NEXT: vmovdqa64 %zmm19, %zmm24 15959; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13 15960; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 15961; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 15962; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15963; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 15964; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 15965; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 15966; AVX512-NEXT: vmovdqu64 %zmm20, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15967; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 15968; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15969; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 15970; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 15971; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 15972; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15973; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 15974; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 15975; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6 15976; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15977; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 15978; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 15979; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15980; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 15981; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 15982; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15983; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15984; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 15985; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 15986; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 15987; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 15988; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 15989; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 15990; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 15991; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 15992; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 15993; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 15994; AVX512-NEXT: vmovdqa64 %zmm29, %zmm11 15995; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 15996; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 15997; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 15998; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15999; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload 16000; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 16001; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 16002; AVX512-NEXT: vmovdqa64 %zmm10, %zmm31 16003; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 16004; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4 16005; AVX512-NEXT: vmovdqa64 %zmm26, %zmm19 16006; AVX512-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 16007; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16008; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 16009; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 16010; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 16011; AVX512-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload 16012; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 16013; AVX512-NEXT: vmovdqa64 %zmm15, %zmm29 16014; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16015; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16016; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16017; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16018; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload 16019; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 16020; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 16021; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 16022; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 16023; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 16024; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16025; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 16026; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 16027; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 16028; 
AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 16029; AVX512-NEXT: vmovdqa64 %zmm24, %zmm13 16030; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16031; AVX512-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 16032; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16033; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 16034; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16035; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 16036; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16037; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 16038; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 16039; AVX512-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 16040; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 16041; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 16042; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 16043; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 16044; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16045; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 16046; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16047; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload 16048; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 16049; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 16050; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 16051; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 16052; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 16053; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 16054; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16055; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 16056; AVX512-NEXT: vmovdqa64 %zmm8, %zmm15 16057; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 16058; AVX512-NEXT: vmovdqa64 %zmm16, %zmm8 16059; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 16060; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 16061; AVX512-NEXT: vmovdqa64 %zmm11, %zmm2 16062; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16063; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16064; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16065; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 16066; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 16067; AVX512-NEXT: vmovdqa64 %zmm31, %zmm16 16068; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4 16069; AVX512-NEXT: vmovdqa64 %zmm10, %zmm27 16070; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 16071; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 16072; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16073; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 16074; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 16075; AVX512-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 16076; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 16077; AVX512-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload 16078; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 16079; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 16080; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16081; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16082; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16083; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 16084; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 16085; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 16086; AVX512-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 16087; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16088; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 16089; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 16090; AVX512-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 16091; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] 16092; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 16093; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16094; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 16095; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16096; AVX512-NEXT: vmovdqa64 %zmm24, %zmm4 16097; AVX512-NEXT: vmovdqa64 %zmm24, %zmm30 16098; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 16099; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 16100; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16101; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 16102; AVX512-NEXT: vmovdqa64 %zmm4, %zmm19 16103; AVX512-NEXT: vmovdqa64 %zmm28, %zmm11 16104; AVX512-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 16105; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 16106; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 16107; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 16108; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 16109; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16110; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 16111; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 16112; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 16113; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 16114; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16115; AVX512-NEXT: vmovdqa64 %zmm23, %zmm31 16116; AVX512-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 16117; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 16118; AVX512-NEXT: vmovdqa64 %zmm23, %zmm16 16119; AVX512-NEXT: vmovdqa64 %zmm27, %zmm29 16120; AVX512-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 16121; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 16122; AVX512-NEXT: vmovdqa64 %zmm27, %zmm26 16123; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2 16124; AVX512-NEXT: vmovdqa64 %zmm10, %zmm5 16125; AVX512-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 16126; AVX512-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 16127; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16128; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 16129; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 16130; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 16131; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 16132; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15 16133; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 16134; AVX512-NEXT: vmovdqa64 %zmm3, %zmm23 16135; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 16136; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 16137; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 16138; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17 16139; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 16140; AVX512-NEXT: vmovdqa64 %zmm3, %zmm24 16141; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 16142; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 16143; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 16144; AVX512-NEXT: vmovdqa64 %zmm3, %zmm27 16145; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 16146; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 16147; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 16148; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 16149; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 16150; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 16151; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3 16152; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16153; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 16154; 
AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 16155; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6 16156; AVX512-NEXT: vmovdqa64 %zmm21, %zmm20 16157; AVX512-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 16158; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 16159; AVX512-NEXT: vmovdqa64 %zmm6, %zmm21 16160; AVX512-NEXT: vmovdqa64 %zmm22, %zmm25 16161; AVX512-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 16162; AVX512-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 16163; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 16164; AVX512-NEXT: vmovdqa64 %zmm7, %zmm14 16165; AVX512-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 16166; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 16167; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 16168; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16169; AVX512-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 16170; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 16171; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 16172; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] 16173; AVX512-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} 16174; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 16175; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] 16176; AVX512-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} 16177; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 16178; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] 16179; AVX512-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} 16180; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 16181; AVX512-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} 16182; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] 16183; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 16184; AVX512-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} 16185; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 16186; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 16187; AVX512-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 16188; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 16189; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} 16190; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload 16191; AVX512-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] 16192; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 16193; AVX512-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 16194; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] 16195; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 16196; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} 16197; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] 16198; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 16199; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 16200; AVX512-NEXT: vmovaps %zmm8, 192(%rsi) 16201; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 16202; AVX512-NEXT: vmovaps %zmm8, 128(%rsi) 16203; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 16204; AVX512-NEXT: vmovaps %zmm8, 64(%rsi) 16205; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16206; AVX512-NEXT: vmovaps %zmm6, (%rsi) 16207; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16208; AVX512-NEXT: vmovaps %zmm6, 192(%rdx) 16209; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16210; AVX512-NEXT: vmovaps %zmm6, (%rdx) 16211; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16212; AVX512-NEXT: vmovaps %zmm6, 64(%rdx) 16213; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm6 # 64-byte Reload 16214; AVX512-NEXT: vmovaps %zmm6, 128(%rdx) 16215; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16216; AVX512-NEXT: vmovaps %zmm6, 192(%rcx) 16217; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16218; AVX512-NEXT: vmovaps %zmm6, (%rcx) 16219; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16220; AVX512-NEXT: vmovaps %zmm6, 64(%rcx) 16221; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16222; AVX512-NEXT: vmovaps %zmm6, 128(%rcx) 16223; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16224; AVX512-NEXT: vmovaps %zmm6, 192(%r8) 16225; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16226; AVX512-NEXT: vmovaps %zmm6, (%r8) 16227; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16228; AVX512-NEXT: vmovaps %zmm6, 64(%r8) 16229; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16230; AVX512-NEXT: vmovaps %zmm6, 128(%r8) 16231; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16232; AVX512-NEXT: vmovaps %zmm6, 192(%r9) 16233; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16234; AVX512-NEXT: vmovaps %zmm6, (%r9) 16235; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16236; AVX512-NEXT: vmovaps %zmm6, 64(%r9) 16237; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16238; AVX512-NEXT: vmovaps %zmm6, 128(%r9) 16239; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 16240; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16241; AVX512-NEXT: vmovaps %zmm6, 192(%rax) 16242; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16243; AVX512-NEXT: vmovaps %zmm6, (%rax) 16244; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16245; AVX512-NEXT: vmovaps %zmm6, 64(%rax) 16246; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16247; AVX512-NEXT: vmovaps %zmm6, 128(%rax) 16248; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 16249; AVX512-NEXT: vmovdqa64 %zmm3, 192(%rax) 16250; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) 16251; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax) 16252; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) 16253; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 16254; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rax) 16255; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rax) 16256; AVX512-NEXT: vmovdqa64 %zmm7, (%rax) 16257; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rax) 16258; AVX512-NEXT: addq $3144, %rsp # imm = 0xC48 16259; AVX512-NEXT: vzeroupper 16260; AVX512-NEXT: retq 16261; 16262; AVX512-FCP-LABEL: load_i32_stride8_vf64: 16263; AVX512-FCP: # %bb.0: 16264; AVX512-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 16265; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11 16266; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16267; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 16268; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 16269; AVX512-FCP-NEXT: vmovaps 1536(%rdi), %zmm0 16270; AVX512-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill 16271; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24 16272; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16273; AVX512-FCP-NEXT: vmovaps 1664(%rdi), %zmm0 16274; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16275; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 16276; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 16277; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 16278; AVX512-FCP-NEXT: 
vmovdqa64 1920(%rdi), %zmm5 16279; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 16280; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 16281; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 16282; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 16283; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 16284; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 16285; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 16286; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 16287; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 16288; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 16289; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 16290; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16291; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 16292; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 16293; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16294; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 16295; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 16296; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 16297; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 16298; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16299; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 16300; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 16301; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 16302; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 16303; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 16304; AVX512-FCP-NEXT: movb $-64, %al 16305; AVX512-FCP-NEXT: kmovw %eax, %k1 16306; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16307; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 16308; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 16309; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 16310; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 16311; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 16312; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 16313; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16314; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16315; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16316; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 16317; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 16318; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16319; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 16320; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 16321; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 16322; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16323; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 16324; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 16325; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 16326; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 16327; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 16328; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 16329; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16330; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16331; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16332; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 16333; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16334; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 16335; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 16336; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 16337; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16338; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 16339; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 16340; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16341; AVX512-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 16342; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 16343; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 16344; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload 16345; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 16346; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 16347; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 16348; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16349; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 16350; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16351; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16352; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16353; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 16354; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 16355; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 16356; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 16357; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 16358; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16359; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 16360; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16361; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 16362; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16363; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 16364; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 16365; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19 16366; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 16367; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16368; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16369; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 16370; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16371; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 16372; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16373; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 16374; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 16375; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 16376; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16377; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 16378; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16379; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16380; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 16381; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 16382; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 16383; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 16384; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16385; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 16386; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 16387; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16388; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16389; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16390; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16391; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 16392; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 16393; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 16394; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 16395; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 16396; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16397; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16398; AVX512-FCP-NEXT: vmovdqa64 
%zmm15, %zmm1 16399; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload 16400; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 16401; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16402; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 16403; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 16404; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16405; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16406; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16407; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 16408; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 16409; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 16410; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 16411; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16412; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 16413; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 16414; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 16415; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 16416; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 16417; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16418; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16419; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16420; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16421; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 16422; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 16423; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 16424; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 16425; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 16426; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16427; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 16428; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 16429; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 16430; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 16431; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 16432; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16433; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 16434; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16435; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 16436; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16437; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 16438; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 16439; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 16440; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 16441; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 16442; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16443; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 16444; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 16445; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] 16446; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16447; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16448; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 16449; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 16450; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 16451; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 16452; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16453; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 16454; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 16455; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 16456; 
AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 16457; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] 16458; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16459; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16460; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 16461; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 16462; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 16463; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 16464; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16465; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 16466; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 16467; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 16468; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16469; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 16470; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 16471; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 16472; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 16473; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload 16474; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 16475; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm15 16476; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 16477; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16478; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16479; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16480; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 16481; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 16482; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 16483; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 16484; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 16485; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16486; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 16487; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 16488; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 16489; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 16490; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 16491; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 16492; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16493; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 16494; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16495; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 16496; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16497; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 16498; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 16499; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 16500; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 16501; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 16502; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16503; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 16504; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm21 16505; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 16506; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 16507; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 16508; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16509; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16510; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16511; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 16512; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 16513; 
AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16514; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 16515; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 16516; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 16517; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 16518; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16519; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 16520; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 16521; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 16522; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 16523; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16524; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16525; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16526; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 16527; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 16528; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 16529; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 16530; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 16531; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16532; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 16533; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 16534; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 16535; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 16536; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 16537; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16538; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16539; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16540; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 16541; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 16542; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 16543; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 16544; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 16545; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 16546; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16547; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 16548; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 16549; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 16550; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 16551; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm24 16552; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 16553; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16554; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 16555; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16556; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 16557; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16558; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 16559; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16560; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 16561; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16562; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 16563; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 16564; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 16565; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16566; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 16567; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 16568; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 16569; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16570; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 16571; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte 
Reload 16572; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16573; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 16574; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16575; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16576; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16577; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 16578; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 16579; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 16580; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 16581; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16582; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 16583; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 16584; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 16585; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 16586; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 16587; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 16588; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 16589; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16590; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16591; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16592; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload 16593; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 16594; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 16595; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 16596; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 16597; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 16598; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 16599; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 16600; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16601; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 16602; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 16603; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 16604; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload 16605; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 16606; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 16607; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16608; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16609; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16610; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16611; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload 16612; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 16613; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 16614; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 16615; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 16616; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 16617; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16618; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 16619; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 16620; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 16621; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 16622; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 16623; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16624; AVX512-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 16625; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16626; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 16627; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16628; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = 
[5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 16629; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16630; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 16631; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 16632; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 16633; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 16634; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 16635; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 16636; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 16637; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16638; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 16639; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16640; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload 16641; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 16642; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 16643; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 16644; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 16645; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 16646; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 16647; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16648; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 16649; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 16650; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 16651; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 16652; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 16653; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 16654; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 16655; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16656; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16657; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16658; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 16659; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 16660; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm16 16661; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 16662; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 16663; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 16664; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 16665; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16666; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 16667; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 16668; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 16669; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 16670; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload 16671; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 16672; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 16673; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16674; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16675; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16676; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 16677; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 16678; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 16679; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 16680; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16681; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 16682; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 16683; AVX512-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 16684; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16685; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 16686; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16687; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = 
[6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 16688; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16689; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 16690; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 16691; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 16692; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 16693; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16694; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 16695; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 16696; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 16697; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 16698; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 16699; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 16700; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 16701; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 16702; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16703; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 16704; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 16705; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 16706; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 16707; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16708; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 16709; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 16710; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 16711; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 16712; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm29 16713; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 16714; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 16715; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 16716; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 16717; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 16718; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 16719; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 16720; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16721; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 16722; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 16723; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 16724; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 16725; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 16726; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 16727; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 16728; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 16729; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 16730; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 16731; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 16732; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 16733; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 16734; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 16735; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 16736; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 16737; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 16738; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 16739; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 16740; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 16741; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 16742; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 16743; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 16744; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 16745; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16746; AVX512-FCP-NEXT: vpermt2d 
%zmm6, %zmm1, %zmm3 16747; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 16748; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 16749; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 16750; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 16751; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 16752; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 16753; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 16754; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 16755; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 16756; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 16757; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 16758; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 16759; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 16760; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 16761; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16762; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 16763; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 16764; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 16765; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] 16766; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} 16767; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 16768; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] 16769; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} 16770; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 16771; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] 16772; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} 16773; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 16774; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} 16775; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] 16776; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 16777; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} 16778; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 16779; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 16780; AVX512-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 16781; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 16782; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} 16783; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload 16784; AVX512-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] 16785; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 16786; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 16787; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] 16788; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 16789; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} 16790; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] 16791; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 16792; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 16793; AVX512-FCP-NEXT: vmovaps %zmm8, 192(%rsi) 16794; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 16795; AVX512-FCP-NEXT: vmovaps %zmm8, 128(%rsi) 16796; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 16797; AVX512-FCP-NEXT: vmovaps %zmm8, 64(%rsi) 16798; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16799; AVX512-FCP-NEXT: vmovaps %zmm6, (%rsi) 16800; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16801; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%rdx) 16802; AVX512-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16803; AVX512-FCP-NEXT: vmovaps %zmm6, (%rdx) 16804; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16805; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%rdx) 16806; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16807; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%rdx) 16808; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16809; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%rcx) 16810; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16811; AVX512-FCP-NEXT: vmovaps %zmm6, (%rcx) 16812; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16813; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%rcx) 16814; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16815; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%rcx) 16816; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16817; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%r8) 16818; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16819; AVX512-FCP-NEXT: vmovaps %zmm6, (%r8) 16820; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16821; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%r8) 16822; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16823; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%r8) 16824; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16825; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%r9) 16826; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16827; AVX512-FCP-NEXT: vmovaps %zmm6, (%r9) 16828; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16829; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%r9) 16830; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16831; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%r9) 16832; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 16833; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16834; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%rax) 16835; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16836; AVX512-FCP-NEXT: vmovaps %zmm6, (%rax) 16837; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16838; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%rax) 16839; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 16840; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%rax) 16841; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 16842; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) 16843; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) 16844; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) 16845; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) 16846; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 16847; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax) 16848; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax) 16849; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax) 16850; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) 16851; AVX512-FCP-NEXT: addq $3144, %rsp # imm = 0xC48 16852; AVX512-FCP-NEXT: vzeroupper 16853; AVX512-FCP-NEXT: retq 16854; 16855; AVX512DQ-LABEL: load_i32_stride8_vf64: 16856; AVX512DQ: # %bb.0: 16857; AVX512DQ-NEXT: subq $3144, %rsp # imm = 0xC48 16858; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm11 16859; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16860; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm18 16861; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm31 16862; AVX512DQ-NEXT: vmovaps 1536(%rdi), %zmm0 16863; AVX512DQ-NEXT: vmovups %zmm0, (%rsp) # 64-byte 
Spill 16864; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm24 16865; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16866; AVX512DQ-NEXT: vmovaps 1664(%rdi), %zmm0 16867; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16868; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm21 16869; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm26 16870; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm22 16871; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm5 16872; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm13 16873; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm3 16874; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm30 16875; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm2 16876; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm29 16877; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm27 16878; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm20 16879; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm10 16880; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm25 16881; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm7 16882; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm9 16883; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16884; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm12 16885; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm6 16886; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16887; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm28 16888; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm23 16889; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm4 16890; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 16891; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16892; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 16893; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm16 16894; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 16895; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 16896; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 16897; AVX512DQ-NEXT: movb $-64, %al 16898; AVX512DQ-NEXT: kmovw %eax, %k1 16899; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16900; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1 16901; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 16902; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 16903; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 16904; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 16905; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 16906; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16907; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16908; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16909; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 16910; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm6 16911; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16912; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 16913; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 16914; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 16915; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16916; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 16917; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 16918; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 16919; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 16920; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 16921; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 16922; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16923; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16924; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16925; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 16926; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16927; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 16928; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, 
%zmm1 16929; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm17 16930; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16931; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 16932; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 16933; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16934; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 16935; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 16936; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 16937; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload 16938; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 16939; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm24 16940; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 16941; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16942; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5 16943; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16944; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16945; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16946; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 16947; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 16948; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm22 16949; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 16950; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 16951; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16952; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1 16953; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16954; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 16955; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16956; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 16957; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm7 16958; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm19 16959; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 16960; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16961; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 16962; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 16963; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16964; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 16965; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16966; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 16967; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 16968; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 16969; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16970; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 16971; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16972; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16973; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 16974; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm31 16975; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 16976; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm12 16977; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16978; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm30 16979; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 16980; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16981; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16982; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16983; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16984; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 16985; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 16986; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 16987; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 16988; AVX512DQ-NEXT: vpermt2d 
%zmm5, %zmm0, %zmm4 16989; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 16990; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16991; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 16992; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload 16993; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 16994; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16995; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 16996; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 16997; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 16998; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 16999; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17000; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 17001; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 17002; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 17003; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 17004; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17005; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 17006; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 17007; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 17008; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 17009; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 17010; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17011; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17012; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17013; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17014; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 17015; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 17016; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 17017; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17018; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 17019; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17020; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 17021; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 17022; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 17023; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 17024; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 17025; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17026; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 17027; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17028; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 17029; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17030; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 17031; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 17032; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 17033; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 17034; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 17035; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17036; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 17037; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 17038; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] 17039; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17040; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17041; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 17042; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 17043; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 17044; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 17045; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17046; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 17047; AVX512DQ-NEXT: vmovdqa64 %zmm13, 
%zmm16 17048; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 17049; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm29 17050; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] 17051; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17052; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17053; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 17054; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 17055; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 17056; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 17057; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17058; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 17059; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm7 17060; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 17061; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17062; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 17063; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 17064; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm25 17065; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 17066; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload 17067; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 17068; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm15 17069; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 17070; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17071; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17072; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17073; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 17074; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 17075; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 17076; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 17077; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 17078; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17079; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm6 17080; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 17081; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 17082; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm24 17083; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 17084; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 17085; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17086; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 17087; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17088; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 17089; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17090; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 17091; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 17092; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 17093; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17094; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 17095; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17096; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 17097; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm21 17098; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 17099; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 17100; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 17101; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17102; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17103; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17104; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 17105; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 17106; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill 17107; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 17108; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 17109; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 17110; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm31 17111; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17112; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 17113; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 17114; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 17115; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 17116; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17117; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17118; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17119; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 17120; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 17121; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 17122; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 17123; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm26 17124; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17125; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 17126; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm9 17127; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 17128; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 17129; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 17130; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17131; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17132; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17133; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 17134; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 17135; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm25 17136; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 17137; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 17138; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 17139; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17140; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 17141; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm18 17142; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 17143; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 17144; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm24 17145; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13 17146; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17147; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 17148; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17149; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 17150; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17151; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 17152; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17153; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 17154; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17155; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 17156; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 17157; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 17158; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17159; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 17160; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 17161; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm6 17162; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17163; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 17164; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 17165; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17166; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 17167; AVX512DQ-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17168; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17169; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17170; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 17171; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 17172; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 17173; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 17174; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17175; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 17176; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 17177; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 17178; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17179; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 17180; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm11 17181; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 17182; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17183; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17184; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17185; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload 17186; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 17187; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 17188; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm31 17189; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 17190; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4 17191; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm19 17192; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 17193; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17194; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 17195; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 17196; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 17197; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload 17198; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 17199; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm29 17200; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17201; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17202; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17203; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17204; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload 17205; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 17206; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 17207; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 17208; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 17209; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 17210; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17211; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 17212; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 17213; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 17214; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 17215; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm13 17216; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17217; AVX512DQ-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 17218; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17219; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 17220; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17221; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 17222; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17223; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 17224; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 17225; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 17226; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 17227; 
AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 17228; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 17229; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 17230; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17231; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 17232; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17233; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload 17234; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1 17235; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 17236; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 17237; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 17238; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 17239; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 17240; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17241; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 17242; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm15 17243; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 17244; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm8 17245; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 17246; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 17247; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm2 17248; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17249; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17250; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17251; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 17252; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 17253; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm16 17254; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4 17255; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm27 17256; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 17257; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm20 17258; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17259; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 17260; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 17261; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 17262; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17 17263; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload 17264; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 17265; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 17266; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17267; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17268; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17269; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 17270; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 17271; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 17272; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 17273; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17274; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 17275; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 17276; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 17277; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17278; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 17279; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17280; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 17281; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17282; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm4 17283; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm30 17284; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 17285; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 17286; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17287; AVX512DQ-NEXT: vpermt2d %zmm3, 
%zmm0, %zmm4 17288; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm19 17289; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm11 17290; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 17291; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 17292; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9 17293; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 17294; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 17295; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17296; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 17297; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 17298; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 17299; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 17300; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17301; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm31 17302; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 17303; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 17304; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm16 17305; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm29 17306; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 17307; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 17308; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm26 17309; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2 17310; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5 17311; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 17312; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 17313; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17314; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 17315; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17316; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 17317; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 17318; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm15 17319; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 17320; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm23 17321; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17322; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 17323; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 17324; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm17 17325; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 17326; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm24 17327; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17328; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 17329; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 17330; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm27 17331; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 17332; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 17333; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 17334; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 17335; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 17336; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 17337; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3 17338; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17339; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 17340; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 17341; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm6 17342; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm20 17343; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 17344; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 17345; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm21 17346; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm25 17347; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 17348; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 17349; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 17350; AVX512DQ-NEXT: vmovdqa64 %zmm7, 
%zmm14 17351; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 17352; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 17353; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17354; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17355; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 17356; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 17357; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 17358; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] 17359; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} 17360; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 17361; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] 17362; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} 17363; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 17364; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] 17365; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} 17366; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 17367; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} 17368; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] 17369; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 17370; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} 17371; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 17372; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 17373; AVX512DQ-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 17374; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 17375; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} 17376; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload 17377; AVX512DQ-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] 17378; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 17379; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 17380; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] 17381; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 17382; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} 17383; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] 17384; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 17385; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 17386; AVX512DQ-NEXT: vmovaps %zmm8, 192(%rsi) 17387; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 17388; AVX512DQ-NEXT: vmovaps %zmm8, 128(%rsi) 17389; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 17390; AVX512DQ-NEXT: vmovaps %zmm8, 64(%rsi) 17391; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17392; AVX512DQ-NEXT: vmovaps %zmm6, (%rsi) 17393; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17394; AVX512DQ-NEXT: vmovaps %zmm6, 192(%rdx) 17395; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17396; AVX512DQ-NEXT: vmovaps %zmm6, (%rdx) 17397; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17398; AVX512DQ-NEXT: vmovaps %zmm6, 64(%rdx) 17399; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17400; AVX512DQ-NEXT: vmovaps %zmm6, 128(%rdx) 17401; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17402; AVX512DQ-NEXT: vmovaps %zmm6, 192(%rcx) 17403; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17404; AVX512DQ-NEXT: vmovaps %zmm6, (%rcx) 17405; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17406; AVX512DQ-NEXT: vmovaps %zmm6, 64(%rcx) 
17407; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17408; AVX512DQ-NEXT: vmovaps %zmm6, 128(%rcx) 17409; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17410; AVX512DQ-NEXT: vmovaps %zmm6, 192(%r8) 17411; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17412; AVX512DQ-NEXT: vmovaps %zmm6, (%r8) 17413; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17414; AVX512DQ-NEXT: vmovaps %zmm6, 64(%r8) 17415; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17416; AVX512DQ-NEXT: vmovaps %zmm6, 128(%r8) 17417; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17418; AVX512DQ-NEXT: vmovaps %zmm6, 192(%r9) 17419; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17420; AVX512DQ-NEXT: vmovaps %zmm6, (%r9) 17421; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17422; AVX512DQ-NEXT: vmovaps %zmm6, 64(%r9) 17423; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17424; AVX512DQ-NEXT: vmovaps %zmm6, 128(%r9) 17425; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 17426; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17427; AVX512DQ-NEXT: vmovaps %zmm6, 192(%rax) 17428; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17429; AVX512DQ-NEXT: vmovaps %zmm6, (%rax) 17430; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17431; AVX512DQ-NEXT: vmovaps %zmm6, 64(%rax) 17432; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17433; AVX512DQ-NEXT: vmovaps %zmm6, 128(%rax) 17434; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 17435; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rax) 17436; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) 17437; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax) 17438; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) 17439; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 17440; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rax) 17441; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rax) 17442; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax) 17443; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rax) 17444; AVX512DQ-NEXT: addq $3144, %rsp # imm = 0xC48 17445; AVX512DQ-NEXT: vzeroupper 17446; AVX512DQ-NEXT: retq 17447; 17448; AVX512DQ-FCP-LABEL: load_i32_stride8_vf64: 17449; AVX512DQ-FCP: # %bb.0: 17450; AVX512DQ-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 17451; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11 17452; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17453; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 17454; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 17455; AVX512DQ-FCP-NEXT: vmovaps 1536(%rdi), %zmm0 17456; AVX512DQ-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill 17457; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24 17458; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17459; AVX512DQ-FCP-NEXT: vmovaps 1664(%rdi), %zmm0 17460; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17461; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 17462; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 17463; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 17464; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 17465; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 17466; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 17467; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 17468; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 17469; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 17470; 
AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 17471; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 17472; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 17473; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 17474; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 17475; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 17476; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17477; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 17478; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 17479; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17480; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 17481; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 17482; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 17483; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 17484; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17485; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 17486; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 17487; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 17488; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 17489; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 17490; AVX512DQ-FCP-NEXT: movb $-64, %al 17491; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 17492; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17493; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 17494; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 17495; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 17496; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 17497; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 17498; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 17499; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17500; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17501; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17502; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 17503; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 17504; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17505; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 17506; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 17507; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 17508; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17509; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 17510; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 17511; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 17512; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 17513; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 17514; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 17515; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17516; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17517; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17518; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 17519; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17520; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 17521; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 17522; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 17523; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17524; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 17525; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 17526; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17527; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 17528; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 17529; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 17530; AVX512DQ-FCP-NEXT: 
vmovdqu64 (%rsp), %zmm2 # 64-byte Reload 17531; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 17532; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 17533; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 17534; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17535; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 17536; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17537; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17538; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17539; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 17540; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 17541; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 17542; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 17543; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 17544; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17545; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 17546; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17547; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 17548; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17549; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 17550; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 17551; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19 17552; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 17553; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17554; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17555; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 17556; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17557; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 17558; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17559; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 17560; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 17561; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 17562; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17563; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 17564; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17565; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17566; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 17567; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 17568; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 17569; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 17570; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17571; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 17572; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 17573; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17574; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17575; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17576; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17577; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 17578; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 17579; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 17580; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 17581; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 17582; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17583; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17584; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 17585; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 
# 64-byte Reload 17586; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 17587; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17588; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 17589; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 17590; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17591; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17592; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17593; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 17594; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 17595; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 17596; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 17597; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17598; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 17599; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 17600; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 17601; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 17602; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 17603; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17604; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17605; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17606; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17607; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 17608; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 17609; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 17610; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17611; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 17612; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17613; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 17614; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 17615; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 17616; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 17617; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 17618; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17619; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 17620; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17621; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 17622; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17623; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 17624; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 17625; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 17626; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 17627; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 17628; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17629; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 17630; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 17631; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] 17632; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17633; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17634; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 17635; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 17636; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 17637; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 17638; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17639; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 17640; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 17641; AVX512DQ-FCP-NEXT: vpermt2d 
%zmm25, %zmm0, %zmm9 17642; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 17643; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] 17644; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17645; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17646; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 17647; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 17648; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 17649; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 17650; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17651; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 17652; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 17653; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 17654; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17655; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 17656; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 17657; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 17658; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 17659; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload 17660; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 17661; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm15 17662; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 17663; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17664; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17665; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17666; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 17667; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 17668; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 17669; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 17670; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 17671; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17672; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 17673; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 17674; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 17675; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 17676; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 17677; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 17678; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17679; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 17680; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17681; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 17682; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17683; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 17684; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 17685; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 17686; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17687; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 17688; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17689; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 17690; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm21 17691; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 17692; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 17693; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 17694; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17695; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17696; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17697; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 17698; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 17699; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17700; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 17701; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 17702; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 17703; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 17704; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17705; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 17706; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 17707; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 17708; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 17709; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17710; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17711; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17712; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 17713; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 17714; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 17715; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 17716; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 17717; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17718; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 17719; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 17720; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 17721; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 17722; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 17723; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17724; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17725; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17726; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 17727; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 17728; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 17729; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 17730; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 17731; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 17732; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17733; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 17734; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 17735; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 17736; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 17737; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm24 17738; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 17739; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17740; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 17741; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17742; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 17743; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17744; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 17745; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17746; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 17747; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17748; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 17749; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 17750; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 17751; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17752; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 17753; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 17754; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 17755; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17756; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 17757; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 17758; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17759; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 17760; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17761; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17762; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17763; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 17764; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 17765; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 17766; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 17767; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17768; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 17769; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 17770; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 17771; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17772; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 17773; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 17774; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 17775; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17776; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17777; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17778; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload 17779; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 17780; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 17781; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 17782; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 17783; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 17784; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 17785; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 17786; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17787; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 17788; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 17789; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 17790; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload 17791; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 17792; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 17793; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17794; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17795; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17796; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17797; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload 17798; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 17799; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 17800; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 17801; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 17802; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 17803; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17804; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 17805; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 17806; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 17807; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 17808; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 17809; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill 17810; AVX512DQ-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 17811; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17812; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 17813; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17814; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 17815; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17816; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 17817; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 17818; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 17819; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 17820; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 17821; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 17822; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 17823; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17824; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 17825; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17826; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload 17827; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 17828; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 17829; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 17830; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 17831; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 17832; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 17833; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17834; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 17835; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 17836; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 17837; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 17838; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 17839; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 17840; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 17841; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17842; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17843; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17844; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 17845; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 17846; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm16 17847; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 17848; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 17849; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 17850; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 17851; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 17852; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 17853; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 17854; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 17855; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 17856; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload 17857; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 17858; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 17859; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 17860; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 17861; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17862; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 17863; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 17864; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 17865; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 17866; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 
{%k1} 17867; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 17868; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 17869; AVX512DQ-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 17870; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 17871; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 17872; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17873; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 17874; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17875; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 17876; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 17877; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 17878; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 17879; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17880; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 17881; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 17882; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 17883; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 17884; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 17885; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 17886; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 17887; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 17888; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17889; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 17890; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 17891; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 17892; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 17893; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17894; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 17895; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 17896; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 17897; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 17898; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm29 17899; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 17900; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 17901; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 17902; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 17903; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 17904; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 17905; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 17906; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17907; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 17908; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17909; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 17910; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 17911; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 17912; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 17913; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 17914; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17915; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 17916; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 17917; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 17918; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 17919; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 17920; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17921; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 17922; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 17923; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 17924; AVX512DQ-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 17925; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 17926; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 17927; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 17928; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 17929; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 17930; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 17931; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17932; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 17933; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 17934; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 17935; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 17936; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 17937; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 17938; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 17939; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 17940; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 17941; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 17942; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 17943; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 17944; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 17945; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 17946; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 17947; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17948; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 17949; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 17950; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 17951; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] 17952; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} 17953; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 17954; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] 17955; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} 17956; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 17957; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] 17958; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} 17959; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 17960; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} 17961; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] 17962; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 17963; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} 17964; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 17965; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 17966; AVX512DQ-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 17967; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 17968; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} 17969; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload 17970; AVX512DQ-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] 17971; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 17972; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 17973; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] 17974; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 17975; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} 17976; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] 17977; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 17978; AVX512DQ-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 17979; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 192(%rsi) 17980; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 17981; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 128(%rsi) 17982; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 17983; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 64(%rsi) 17984; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17985; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%rsi) 17986; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17987; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%rdx) 17988; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17989; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%rdx) 17990; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17991; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%rdx) 17992; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17993; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%rdx) 17994; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17995; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%rcx) 17996; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17997; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%rcx) 17998; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 17999; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%rcx) 18000; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18001; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%rcx) 18002; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18003; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%r8) 18004; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18005; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%r8) 18006; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18007; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%r8) 18008; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18009; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%r8) 18010; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18011; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%r9) 18012; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18013; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%r9) 18014; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18015; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%r9) 18016; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18017; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%r9) 18018; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 18019; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18020; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%rax) 18021; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18022; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%rax) 18023; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18024; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%rax) 18025; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18026; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%rax) 18027; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 18028; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) 18029; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) 18030; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) 18031; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) 18032; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 18033; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm9, 128(%rax) 18034; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax) 18035; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) 18036; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) 18037; AVX512DQ-FCP-NEXT: addq $3144, %rsp # imm = 0xC48 18038; AVX512DQ-FCP-NEXT: vzeroupper 18039; AVX512DQ-FCP-NEXT: retq 18040; 18041; AVX512BW-LABEL: load_i32_stride8_vf64: 18042; AVX512BW: # %bb.0: 18043; AVX512BW-NEXT: subq $3144, %rsp # imm = 0xC48 18044; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm11 18045; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18046; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm18 18047; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm31 18048; AVX512BW-NEXT: vmovaps 1536(%rdi), %zmm0 18049; AVX512BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill 18050; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm24 18051; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18052; AVX512BW-NEXT: vmovaps 1664(%rdi), %zmm0 18053; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18054; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 18055; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 18056; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm22 18057; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm5 18058; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 18059; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 18060; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm30 18061; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 18062; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm29 18063; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 18064; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm20 18065; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm10 18066; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm25 18067; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm7 18068; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm9 18069; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18070; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12 18071; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 18072; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18073; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 18074; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm23 18075; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 18076; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 18077; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 18078; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 18079; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 18080; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 18081; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 18082; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 18083; AVX512BW-NEXT: movb $-64, %al 18084; AVX512BW-NEXT: kmovd %eax, %k1 18085; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18086; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 18087; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 18088; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 18089; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 18090; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 18091; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 18092; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18093; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18094; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18095; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 18096; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 18097; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18098; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 18099; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 18100; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, 
%zmm4 18101; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18102; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 18103; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 18104; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 18105; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 18106; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 18107; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 18108; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18109; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18110; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18111; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 18112; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18113; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 18114; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 18115; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 18116; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18117; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 18118; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 18119; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18120; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 18121; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 18122; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 18123; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload 18124; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 18125; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm24 18126; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 18127; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18128; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 18129; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18130; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18131; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18132; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 18133; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 18134; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm22 18135; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 18136; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 18137; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18138; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 18139; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18140; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 18141; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18142; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 18143; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 18144; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 18145; AVX512BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 18146; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18147; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 18148; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 18149; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18150; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 18151; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 18152; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 18153; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 18154; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 18155; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18156; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 18157; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18158; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18159; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 18160; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31 18161; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 
18162; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 18163; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18164; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm30 18165; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 18166; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18167; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18168; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18169; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18170; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 18171; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 18172; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 18173; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 18174; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 18175; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18176; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18177; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 18178; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload 18179; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 18180; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18181; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 18182; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 18183; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18184; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18185; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18186; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 18187; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 18188; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 18189; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 18190; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18191; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 18192; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 18193; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 18194; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 18195; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 18196; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18197; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18198; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18199; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18200; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 18201; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 18202; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 18203; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 18204; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 18205; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18206; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 18207; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 18208; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 18209; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 18210; AVX512BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 18211; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 18212; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 18213; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18214; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 18215; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 18216; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 18217; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 18218; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 18219; AVX512BW-NEXT: 
vmovdqa64 %zmm27, %zmm4 18220; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 18221; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18222; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 18223; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 18224; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] 18225; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18226; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18227; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 18228; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 18229; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 18230; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 18231; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18232; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 18233; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 18234; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 18235; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29 18236; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] 18237; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18238; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18239; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 18240; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 18241; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 18242; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 18243; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18244; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 18245; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm7 18246; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 18247; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18248; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 18249; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 18250; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm25 18251; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 18252; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload 18253; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 18254; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15 18255; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 18256; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18257; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18258; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18259; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 18260; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 18261; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 18262; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 18263; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 18264; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18265; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 18266; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 18267; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 18268; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 18269; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 18270; AVX512BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 18271; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 18272; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 18273; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18274; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 18275; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 18276; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 18277; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 18278; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 18279; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 18280; AVX512BW-NEXT: vpermt2d %zmm2, 
%zmm0, %zmm4 18281; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18282; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 18283; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm21 18284; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 18285; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 18286; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 18287; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18288; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18289; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18290; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 18291; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 18292; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18293; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 18294; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 18295; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 18296; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm31 18297; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18298; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 18299; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 18300; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 18301; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 18302; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18303; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18304; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18305; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 18306; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 18307; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 18308; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 18309; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 18310; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18311; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 18312; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 18313; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 18314; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 18315; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 18316; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18317; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18318; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18319; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 18320; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 18321; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm25 18322; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 18323; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 18324; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 18325; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18326; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 18327; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 18328; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 18329; AVX512BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 18330; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm24 18331; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 18332; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 18333; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 18334; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18335; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 18336; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 18337; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 18338; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18339; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 18340; AVX512BW-NEXT: vmovdqu64 %zmm27, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18341; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 18342; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 18343; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 18344; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18345; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 18346; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 18347; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 18348; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18349; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 18350; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 18351; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18352; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 18353; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18354; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18355; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18356; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 18357; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 18358; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 18359; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 18360; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18361; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 18362; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 18363; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 18364; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 18365; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 18366; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm11 18367; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 18368; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18369; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18370; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18371; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload 18372; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 18373; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 18374; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm31 18375; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 18376; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 18377; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm19 18378; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 18379; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18380; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 18381; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 18382; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 18383; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload 18384; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 18385; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm29 18386; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18387; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18388; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18389; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18390; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload 18391; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 18392; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 18393; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 18394; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 18395; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 18396; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18397; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 18398; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 18399; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 18400; AVX512BW-NEXT: 
vmovdqa64 %zmm13, %zmm5 18401; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13 18402; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18403; AVX512BW-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 18404; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 18405; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 18406; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18407; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 18408; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 18409; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 18410; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 18411; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 18412; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 18413; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 18414; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 18415; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 18416; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18417; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 18418; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18419; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload 18420; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 18421; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 18422; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 18423; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 18424; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 18425; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 18426; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18427; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 18428; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 18429; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 18430; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 18431; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 18432; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 18433; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 18434; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18435; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18436; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18437; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 18438; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 18439; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm16 18440; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 18441; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm27 18442; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 18443; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 18444; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18445; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 18446; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 18447; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 18448; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 18449; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload 18450; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 18451; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 18452; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18453; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18454; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18455; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 18456; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 18457; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 18458; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 18459; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18460; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 18461; AVX512BW-NEXT: vpermt2d %zmm18, 
%zmm0, %zmm1 18462; AVX512BW-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 18463; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 18464; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 18465; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18466; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 18467; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 18468; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm4 18469; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 18470; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 18471; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 18472; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 18473; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 18474; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm19 18475; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 18476; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 18477; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 18478; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 18479; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 18480; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 18481; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18482; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 18483; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 18484; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 18485; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 18486; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18487; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 18488; AVX512BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 18489; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 18490; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 18491; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm29 18492; AVX512BW-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 18493; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 18494; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 18495; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 18496; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 18497; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 18498; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 18499; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18500; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 18501; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 18502; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 18503; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 18504; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 18505; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 18506; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 18507; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 18508; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 18509; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 18510; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 18511; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 18512; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm24 18513; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 18514; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 18515; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 18516; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm27 18517; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 18518; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 18519; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 18520; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 18521; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 18522; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 18523; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 18524; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18525; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 18526; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 18527; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 18528; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 18529; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 18530; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 18531; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 18532; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 18533; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 18534; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 18535; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 18536; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 18537; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 18538; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 18539; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 18540; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18541; AVX512BW-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 18542; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 18543; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 18544; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] 18545; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} 18546; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 18547; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] 18548; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} 18549; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 18550; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] 18551; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} 18552; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 18553; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} 18554; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] 18555; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 18556; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} 18557; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 18558; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 18559; AVX512BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 18560; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 18561; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} 18562; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload 18563; AVX512BW-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] 18564; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 18565; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 18566; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] 18567; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 18568; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} 18569; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] 18570; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 18571; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 18572; AVX512BW-NEXT: vmovaps %zmm8, 192(%rsi) 18573; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 18574; AVX512BW-NEXT: vmovaps %zmm8, 128(%rsi) 18575; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 18576; AVX512BW-NEXT: vmovaps %zmm8, 64(%rsi) 18577; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18578; AVX512BW-NEXT: vmovaps %zmm6, (%rsi) 18579; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 
64-byte Reload 18580; AVX512BW-NEXT: vmovaps %zmm6, 192(%rdx) 18581; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18582; AVX512BW-NEXT: vmovaps %zmm6, (%rdx) 18583; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18584; AVX512BW-NEXT: vmovaps %zmm6, 64(%rdx) 18585; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18586; AVX512BW-NEXT: vmovaps %zmm6, 128(%rdx) 18587; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18588; AVX512BW-NEXT: vmovaps %zmm6, 192(%rcx) 18589; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18590; AVX512BW-NEXT: vmovaps %zmm6, (%rcx) 18591; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18592; AVX512BW-NEXT: vmovaps %zmm6, 64(%rcx) 18593; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18594; AVX512BW-NEXT: vmovaps %zmm6, 128(%rcx) 18595; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18596; AVX512BW-NEXT: vmovaps %zmm6, 192(%r8) 18597; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18598; AVX512BW-NEXT: vmovaps %zmm6, (%r8) 18599; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18600; AVX512BW-NEXT: vmovaps %zmm6, 64(%r8) 18601; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18602; AVX512BW-NEXT: vmovaps %zmm6, 128(%r8) 18603; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18604; AVX512BW-NEXT: vmovaps %zmm6, 192(%r9) 18605; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18606; AVX512BW-NEXT: vmovaps %zmm6, (%r9) 18607; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18608; AVX512BW-NEXT: vmovaps %zmm6, 64(%r9) 18609; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18610; AVX512BW-NEXT: vmovaps %zmm6, 128(%r9) 18611; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 18612; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18613; AVX512BW-NEXT: vmovaps %zmm6, 192(%rax) 18614; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18615; AVX512BW-NEXT: vmovaps %zmm6, (%rax) 18616; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18617; AVX512BW-NEXT: vmovaps %zmm6, 64(%rax) 18618; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18619; AVX512BW-NEXT: vmovaps %zmm6, 128(%rax) 18620; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 18621; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) 18622; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) 18623; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) 18624; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) 18625; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 18626; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax) 18627; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) 18628; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) 18629; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) 18630; AVX512BW-NEXT: addq $3144, %rsp # imm = 0xC48 18631; AVX512BW-NEXT: vzeroupper 18632; AVX512BW-NEXT: retq 18633; 18634; AVX512BW-FCP-LABEL: load_i32_stride8_vf64: 18635; AVX512BW-FCP: # %bb.0: 18636; AVX512BW-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 18637; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11 18638; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18639; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 18640; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 18641; AVX512BW-FCP-NEXT: vmovaps 1536(%rdi), %zmm0 18642; AVX512BW-FCP-NEXT: vmovups 
%zmm0, (%rsp) # 64-byte Spill 18643; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24 18644; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18645; AVX512BW-FCP-NEXT: vmovaps 1664(%rdi), %zmm0 18646; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18647; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 18648; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 18649; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 18650; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 18651; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 18652; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 18653; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 18654; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 18655; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 18656; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 18657; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 18658; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 18659; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 18660; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 18661; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 18662; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18663; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 18664; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 18665; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18666; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 18667; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 18668; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 18669; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 18670; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 18671; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 18672; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 18673; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 18674; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 18675; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 18676; AVX512BW-FCP-NEXT: movb $-64, %al 18677; AVX512BW-FCP-NEXT: kmovd %eax, %k1 18678; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18679; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 18680; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 18681; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 18682; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 18683; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 18684; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 18685; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18686; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18687; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18688; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 18689; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 18690; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18691; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 18692; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 18693; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 18694; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18695; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 18696; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 18697; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 18698; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 18699; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 18700; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 18701; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18702; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18703; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18704; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 18705; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18706; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 18707; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 18708; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 18709; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18710; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 18711; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 18712; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18713; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 18714; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 18715; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 18716; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload 18717; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 18718; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 18719; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 18720; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18721; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 18722; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18723; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18724; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18725; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 18726; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 18727; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 18728; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 18729; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 18730; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18731; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 18732; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18733; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 18734; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18735; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 18736; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 18737; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19 18738; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 18739; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18740; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 18741; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 18742; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18743; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 18744; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 18745; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 18746; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 18747; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 18748; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18749; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 18750; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18751; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18752; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 18753; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 18754; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 18755; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 18756; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18757; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 18758; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 18759; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm12[0,1,2,3],ymm1[4,5,6,7] 18760; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18761; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18762; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18763; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 18764; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 18765; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 18766; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 18767; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 18768; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18769; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18770; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 18771; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload 18772; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 18773; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18774; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 18775; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 18776; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18777; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18778; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18779; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 18780; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 18781; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 18782; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 18783; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18784; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 18785; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 18786; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 18787; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 18788; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 18789; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18790; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18791; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18792; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 18793; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 18794; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 18795; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 18796; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 18797; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 18798; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18799; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 18800; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 18801; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 18802; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 18803; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 18804; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 18805; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 18806; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18807; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 18808; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 18809; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 18810; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 18811; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 18812; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 18813; AVX512BW-FCP-NEXT: vpermt2d %zmm29, 
%zmm0, %zmm4 18814; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18815; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 18816; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 18817; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] 18818; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18819; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18820; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 18821; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 18822; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 18823; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 18824; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18825; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 18826; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 18827; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 18828; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 18829; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] 18830; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18831; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18832; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 18833; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 18834; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 18835; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 18836; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18837; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 18838; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 18839; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 18840; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18841; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 18842; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 18843; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 18844; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 18845; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload 18846; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 18847; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm15 18848; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 18849; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18850; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18851; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18852; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 18853; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 18854; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 18855; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 18856; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 18857; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18858; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 18859; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 18860; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 18861; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 18862; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 18863; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 18864; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 18865; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 18866; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18867; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 18868; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 18869; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 18870; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 
18871; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 18872; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 18873; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 18874; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18875; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 18876; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm21 18877; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 18878; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 18879; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 18880; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18881; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18882; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18883; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 18884; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 18885; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18886; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 18887; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 18888; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 18889; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 18890; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18891; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 18892; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 18893; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 18894; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 18895; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18896; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18897; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18898; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 18899; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 18900; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 18901; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 18902; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 18903; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18904; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 18905; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 18906; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 18907; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 18908; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 18909; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18910; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18911; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18912; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 18913; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 18914; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 18915; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 18916; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 18917; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 18918; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18919; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 18920; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 18921; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 18922; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 18923; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm24 18924; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 18925; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 18926; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 18927; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
18928; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 18929; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 18930; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 18931; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18932; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 18933; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18934; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 18935; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 18936; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 18937; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18938; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 18939; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 18940; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 18941; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18942; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 18943; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 18944; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18945; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 18946; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18947; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18948; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18949; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 18950; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 18951; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 18952; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 18953; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18954; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 18955; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 18956; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 18957; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 18958; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 18959; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 18960; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 18961; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18962; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18963; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18964; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload 18965; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 18966; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 18967; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 18968; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 18969; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 18970; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 18971; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 18972; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18973; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 18974; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 18975; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 18976; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload 18977; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 18978; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 18979; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18980; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 18981; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 18982; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18983; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload 18984; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 18985; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 18986; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 18987; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 18988; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 18989; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 18990; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 18991; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 18992; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 18993; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 18994; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 18995; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 18996; AVX512BW-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 18997; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 18998; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 18999; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19000; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 19001; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19002; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 19003; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 19004; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 19005; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 19006; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 19007; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 19008; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 19009; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19010; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 19011; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19012; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload 19013; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 19014; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19015; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 19016; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 19017; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 19018; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 19019; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19020; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 19021; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 19022; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 19023; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 19024; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 19025; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 19026; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 19027; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19028; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19029; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19030; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 19031; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 19032; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm16 19033; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 19034; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 19035; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 19036; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 19037; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19038; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 19039; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 19040; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 19041; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 19042; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload 19043; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 19044; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 19045; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19046; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19047; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19048; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 19049; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 19050; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 19051; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 19052; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19053; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 19054; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 19055; AVX512BW-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 19056; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 19057; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 19058; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19059; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 19060; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19061; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 19062; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 19063; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 19064; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 19065; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19066; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 19067; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 19068; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 19069; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 19070; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 19071; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 19072; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 19073; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 19074; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19075; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19076; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 19077; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 19078; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 19079; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19080; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 19081; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 19082; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 19083; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 19084; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm29 19085; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 19086; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 19087; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 19088; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 19089; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 19090; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 19091; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 19092; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19093; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 19094; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 19095; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 19096; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 19097; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm6, %zmm15 19098; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19099; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 19100; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 19101; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 19102; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 19103; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 19104; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19105; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 19106; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 19107; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 19108; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 19109; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 19110; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 19111; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 19112; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19113; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 19114; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 19115; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 19116; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 19117; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19118; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 19119; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 19120; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 19121; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 19122; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 19123; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 19124; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 19125; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 19126; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 19127; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 19128; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 19129; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 19130; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 19131; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 19132; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 19133; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19134; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 19135; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 19136; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 19137; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] 19138; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} 19139; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 19140; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] 19141; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} 19142; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 19143; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] 19144; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} 19145; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 19146; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} 19147; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] 19148; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 19149; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} 19150; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 19151; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 19152; AVX512BW-FCP-NEXT: # ymm4 = 
ymm4[0,1,2,3],mem[4,5,6,7] 19153; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 19154; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} 19155; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload 19156; AVX512BW-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] 19157; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 19158; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 19159; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] 19160; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 19161; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} 19162; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] 19163; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 19164; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 19165; AVX512BW-FCP-NEXT: vmovaps %zmm8, 192(%rsi) 19166; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 19167; AVX512BW-FCP-NEXT: vmovaps %zmm8, 128(%rsi) 19168; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 19169; AVX512BW-FCP-NEXT: vmovaps %zmm8, 64(%rsi) 19170; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19171; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%rsi) 19172; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19173; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%rdx) 19174; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19175; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%rdx) 19176; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19177; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%rdx) 19178; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19179; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%rdx) 19180; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19181; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%rcx) 19182; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19183; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%rcx) 19184; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19185; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%rcx) 19186; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19187; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%rcx) 19188; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19189; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%r8) 19190; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19191; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%r8) 19192; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19193; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%r8) 19194; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19195; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%r8) 19196; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19197; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%r9) 19198; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19199; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%r9) 19200; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19201; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%r9) 19202; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19203; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%r9) 19204; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 19205; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 
# 64-byte Reload 19206; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%rax) 19207; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19208; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%rax) 19209; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19210; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%rax) 19211; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19212; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%rax) 19213; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 19214; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) 19215; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) 19216; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) 19217; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) 19218; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 19219; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax) 19220; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax) 19221; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) 19222; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) 19223; AVX512BW-FCP-NEXT: addq $3144, %rsp # imm = 0xC48 19224; AVX512BW-FCP-NEXT: vzeroupper 19225; AVX512BW-FCP-NEXT: retq 19226; 19227; AVX512DQ-BW-LABEL: load_i32_stride8_vf64: 19228; AVX512DQ-BW: # %bb.0: 19229; AVX512DQ-BW-NEXT: subq $3144, %rsp # imm = 0xC48 19230; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm11 19231; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19232; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm18 19233; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm31 19234; AVX512DQ-BW-NEXT: vmovaps 1536(%rdi), %zmm0 19235; AVX512DQ-BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill 19236; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm24 19237; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19238; AVX512DQ-BW-NEXT: vmovaps 1664(%rdi), %zmm0 19239; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19240; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 19241; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 19242; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm22 19243; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm5 19244; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 19245; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 19246; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm30 19247; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 19248; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm29 19249; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 19250; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm20 19251; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm10 19252; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm25 19253; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm7 19254; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm9 19255; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19256; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm12 19257; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm6 19258; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19259; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm28 19260; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm23 19261; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm4 19262; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 19263; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19264; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 19265; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16 19266; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 19267; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 19268; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 19269; 
AVX512DQ-BW-NEXT: movb $-64, %al 19270; AVX512DQ-BW-NEXT: kmovd %eax, %k1 19271; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19272; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 19273; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 19274; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 19275; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 19276; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 19277; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 19278; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19279; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19280; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19281; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 19282; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm6 19283; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19284; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 19285; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 19286; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 19287; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19288; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 19289; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 19290; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 19291; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 19292; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 19293; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 19294; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19295; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19296; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19297; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 19298; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19299; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 19300; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 19301; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm17 19302; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19303; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 19304; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 19305; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19306; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19307; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 19308; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 19309; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload 19310; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 19311; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm24 19312; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 19313; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19314; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5 19315; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19316; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19317; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19318; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 19319; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 19320; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm22 19321; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 19322; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 19323; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19324; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 19325; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19326; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 19327; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19328; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 19329; AVX512DQ-BW-NEXT: vmovdqa64 
(%rdi), %zmm7 19330; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm19 19331; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 19332; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19333; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 19334; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 19335; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19336; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 19337; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19338; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 19339; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 19340; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 19341; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19342; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 19343; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19344; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19345; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 19346; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm31 19347; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 19348; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm12 19349; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19350; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm30 19351; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 19352; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19353; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19354; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19355; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19356; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 19357; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 19358; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 19359; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 19360; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 19361; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19362; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19363; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 19364; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload 19365; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 19366; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19367; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12 19368; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 19369; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19370; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19371; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19372; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 19373; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 19374; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 19375; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 19376; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19377; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 19378; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19379; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 19380; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 19381; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 19382; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19383; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19384; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19385; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19386; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 19387; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 19388; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 19389; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 19390; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 19391; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19392; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 19393; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 19394; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 19395; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 19396; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 19397; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 19398; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 19399; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19400; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 19401; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19402; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 19403; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 19404; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 19405; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 19406; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 19407; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19408; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 19409; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 19410; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] 19411; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19412; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19413; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 19414; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 19415; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 19416; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 19417; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19418; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 19419; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm16 19420; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 19421; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm29 19422; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] 19423; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19424; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19425; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 19426; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 19427; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 19428; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 19429; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19430; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 19431; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm7 19432; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 19433; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19434; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 19435; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 19436; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm25 19437; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 19438; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload 19439; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 19440; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm15 19441; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 19442; 
AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19443; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19444; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19445; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 19446; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 19447; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 19448; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 19449; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 19450; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19451; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm6 19452; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 19453; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 19454; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm24 19455; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 19456; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 19457; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 19458; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 19459; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19460; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 19461; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19462; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 19463; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 19464; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 19465; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 19466; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 19467; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19468; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 19469; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm21 19470; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 19471; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 19472; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 19473; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19474; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19475; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19476; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 19477; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 19478; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19479; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 19480; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 19481; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 19482; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm31 19483; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19484; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 19485; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 19486; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 19487; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 19488; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19489; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19490; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19491; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 19492; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 19493; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 19494; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 19495; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm26 19496; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19497; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 19498; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm9 
19499; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 19500; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 19501; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 19502; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19503; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19504; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19505; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 19506; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 19507; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm25 19508; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 19509; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 19510; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 19511; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19512; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 19513; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm18 19514; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 19515; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 19516; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm24 19517; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13 19518; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 19519; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 19520; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19521; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 19522; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19523; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 19524; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19525; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 19526; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19527; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 19528; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 19529; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 19530; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19531; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 19532; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 19533; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm6 19534; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19535; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 19536; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 19537; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19538; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 19539; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19540; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19541; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19542; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 19543; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 19544; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 19545; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 19546; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19547; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 19548; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 19549; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 19550; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 19551; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 19552; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm11 19553; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 19554; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19555; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, 
%zmm1 19556; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19557; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload 19558; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 19559; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 19560; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm31 19561; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 19562; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4 19563; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm19 19564; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 19565; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19566; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 19567; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 19568; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26 19569; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload 19570; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 19571; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm29 19572; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19573; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19574; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19575; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19576; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload 19577; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1 19578; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 19579; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 19580; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 19581; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 19582; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19583; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 19584; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 19585; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 19586; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 19587; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm13 19588; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19589; AVX512DQ-BW-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 19590; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 19591; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 19592; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19593; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 19594; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19595; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 19596; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 19597; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 19598; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 19599; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 19600; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 19601; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 19602; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19603; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 19604; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19605; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload 19606; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm1 19607; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19608; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 19609; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 19610; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 19611; 
AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 19612; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19613; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 19614; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm15 19615; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 19616; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm8 19617; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 19618; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 19619; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm2 19620; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19621; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19622; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19623; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 19624; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 19625; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm16 19626; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4 19627; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm27 19628; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 19629; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm20 19630; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19631; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 19632; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 19633; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 19634; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm17 19635; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload 19636; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 19637; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 19638; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19639; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19640; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19641; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1 19642; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 19643; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 19644; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 19645; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19646; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 19647; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 19648; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 19649; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 19650; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 19651; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19652; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 19653; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19654; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm4 19655; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm30 19656; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 19657; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] 19658; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19659; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 19660; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm19 19661; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm11 19662; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 19663; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 19664; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9 19665; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 19666; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 19667; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19668; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19669; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm8 19670; AVX512DQ-BW-NEXT: vpermt2d %zmm2, 
%zmm1, %zmm8 19671; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 19672; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19673; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm31 19674; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 19675; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 19676; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm16 19677; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm29 19678; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 19679; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 19680; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm26 19681; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2 19682; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5 19683; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 19684; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 19685; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19686; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 19687; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 19688; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 19689; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 19690; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15 19691; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19692; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm23 19693; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 19694; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 19695; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 19696; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 19697; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19698; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm24 19699; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 19700; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 19701; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 19702; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm27 19703; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 19704; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 19705; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19706; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 19707; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 19708; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 19709; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3 19710; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19711; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 19712; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 19713; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm6 19714; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm20 19715; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 19716; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 19717; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm21 19718; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm25 19719; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 19720; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 19721; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 19722; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm14 19723; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 19724; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 19725; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 19726; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19727; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 19728; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 19729; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 
19730; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] 19731; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} 19732; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 19733; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] 19734; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} 19735; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 19736; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] 19737; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} 19738; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 19739; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} 19740; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] 19741; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 19742; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} 19743; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 19744; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 19745; AVX512DQ-BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] 19746; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 19747; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} 19748; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload 19749; AVX512DQ-BW-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] 19750; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 19751; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 19752; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] 19753; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 19754; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} 19755; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] 19756; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 19757; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 19758; AVX512DQ-BW-NEXT: vmovaps %zmm8, 192(%rsi) 19759; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 19760; AVX512DQ-BW-NEXT: vmovaps %zmm8, 128(%rsi) 19761; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 19762; AVX512DQ-BW-NEXT: vmovaps %zmm8, 64(%rsi) 19763; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19764; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%rsi) 19765; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19766; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%rdx) 19767; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19768; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%rdx) 19769; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19770; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%rdx) 19771; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19772; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%rdx) 19773; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19774; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%rcx) 19775; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19776; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%rcx) 19777; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19778; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%rcx) 19779; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19780; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%rcx) 19781; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19782; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%r8) 19783; 
AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19784; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%r8) 19785; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19786; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%r8) 19787; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19788; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%r8) 19789; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19790; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%r9) 19791; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19792; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%r9) 19793; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19794; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%r9) 19795; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19796; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%r9) 19797; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 19798; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19799; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%rax) 19800; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19801; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%rax) 19802; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19803; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%rax) 19804; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19805; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%rax) 19806; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 19807; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rax) 19808; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) 19809; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax) 19810; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) 19811; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 19812; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 128(%rax) 19813; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%rax) 19814; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax) 19815; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rax) 19816; AVX512DQ-BW-NEXT: addq $3144, %rsp # imm = 0xC48 19817; AVX512DQ-BW-NEXT: vzeroupper 19818; AVX512DQ-BW-NEXT: retq 19819; 19820; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf64: 19821; AVX512DQ-BW-FCP: # %bb.0: 19822; AVX512DQ-BW-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 19823; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11 19824; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19825; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 19826; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 19827; AVX512DQ-BW-FCP-NEXT: vmovaps 1536(%rdi), %zmm0 19828; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill 19829; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24 19830; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19831; AVX512DQ-BW-FCP-NEXT: vmovaps 1664(%rdi), %zmm0 19832; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19833; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 19834; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 19835; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 19836; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 19837; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 19838; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 19839; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 19840; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 19841; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 19842; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 19843; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
1472(%rdi), %zmm20 19844; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 19845; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 19846; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 19847; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 19848; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19849; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 19850; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 19851; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19852; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 19853; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 19854; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 19855; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] 19856; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19857; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 19858; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 19859; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 19860; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 19861; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 19862; AVX512DQ-BW-FCP-NEXT: movb $-64, %al 19863; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 19864; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19865; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 19866; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 19867; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 19868; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 19869; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 19870; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 19871; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19872; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19873; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19874; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 19875; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 19876; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19877; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 19878; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 19879; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 19880; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19881; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 19882; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 19883; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 19884; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 19885; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 19886; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 19887; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19888; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19889; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19890; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 19891; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19892; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 19893; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 19894; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 19895; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19896; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 19897; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 19898; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19899; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19900; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 
%zmm1 19901; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 19902; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload 19903; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 19904; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 19905; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 19906; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19907; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 19908; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19909; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19910; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19911; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 19912; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 19913; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 19914; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 19915; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 19916; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19917; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 19918; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19919; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 19920; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19921; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 19922; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 19923; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19 19924; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 19925; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19926; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 19927; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 19928; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19929; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] 19930; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19931; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 19932; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 19933; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 19934; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19935; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 19936; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19937; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19938; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 19939; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 19940; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 19941; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 19942; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19943; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 19944; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 19945; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19946; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19947; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19948; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19949; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 19950; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 19951; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 19952; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 19953; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 19954; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19955; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19956; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 19957; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload 19958; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 19959; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19960; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 19961; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 19962; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19963; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19964; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19965; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 19966; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 19967; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 19968; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 19969; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19970; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 19971; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 19972; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 19973; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 19974; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 19975; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 19976; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 19977; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19978; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 19979; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 19980; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 19981; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 19982; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 19983; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 19984; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 19985; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 19986; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 19987; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 19988; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 19989; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 19990; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 19991; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 19992; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 19993; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] 19994; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 19995; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 19996; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 19997; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 19998; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 19999; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 20000; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 20001; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 20002; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 20003; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] 20004; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 20005; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
20006; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 20007; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 20008; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 20009; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 20010; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 20011; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 20012; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 20013; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 20014; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 20015; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] 20016; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 20017; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20018; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 20019; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 20020; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 20021; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 20022; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20023; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 20024; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 20025; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 20026; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 20027; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 20028; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 20029; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 20030; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 20031; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload 20032; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 20033; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm15 20034; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 20035; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 20036; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 20037; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20038; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 20039; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 20040; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 20041; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 20042; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 20043; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 20044; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 20045; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 20046; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 20047; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 20048; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload 20049; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 20050; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 20051; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 20052; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20053; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] 20054; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 20055; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 20056; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 20057; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 20058; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 20059; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 20060; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 
20061; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 20062; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm21 20063; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 20064; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 20065; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 20066; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 20067; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 20068; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20069; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 20070; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 20071; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20072; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 20073; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 20074; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 20075; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 20076; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 20077; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 20078; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 20079; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 20080; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 20081; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 20082; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 20083; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20084; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 20085; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 20086; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 20087; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 20088; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 20089; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 20090; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 20091; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 20092; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 20093; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 20094; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 20095; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 20096; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 20097; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20098; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 20099; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 20100; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 20101; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 20102; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 20103; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 20104; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 20105; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 20106; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 20107; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 20108; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 20109; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm24 20110; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 20111; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 20112; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 20113; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20114; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = 
[4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] 20115; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 20116; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 20117; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20118; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 20119; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20120; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 20121; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 20122; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 20123; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 20124; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 20125; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 20126; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 20127; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20128; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 20129; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 20130; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20131; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 20132; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 20133; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 20134; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20135; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 20136; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 20137; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 20138; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 20139; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 20140; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 20141; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 20142; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 20143; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 20144; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 20145; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 20146; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 20147; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 20148; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 20149; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20150; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload 20151; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 20152; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 20153; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 20154; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 20155; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 20156; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 20157; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 20158; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 20159; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 20160; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 20161; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 20162; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload 20163; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 20164; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 20165; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20166; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 20167; 
AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 20168; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20169; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload 20170; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 20171; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 20172; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 20173; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 20174; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 20175; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 20176; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 20177; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 20178; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 20179; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 20180; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 20181; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20182; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 20183; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 20184; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 20185; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20186; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 20187; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 20188; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 20189; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 20190; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} 20191; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 20192; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 20193; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 20194; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 20195; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 20196; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 20197; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20198; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload 20199; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 20200; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 20201; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 20202; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 20203; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 20204; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 20205; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} 20206; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 20207; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 20208; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 20209; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 20210; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 20211; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 20212; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 20213; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] 20214; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 20215; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 20216; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 20217; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 20218; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm16 20219; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 20220; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm30
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm29
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 192(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 128(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 64(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%r9)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%r9)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%r9)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: addq $3144, %rsp # imm = 0xC48
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <512 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
  %strided.vec1 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249, i32 257, i32 265, i32 273, i32 281, i32 289, i32 297, i32 305, i32 313, i32 321, i32 329, i32 337, i32 345, i32 353, i32 361, i32 369, i32 377, i32 385, i32 393, i32 401, i32 409, i32 417, i32 425, i32 433, i32 441, i32 449, i32 457, i32 465, i32 473, i32 481, i32 489, i32 497, i32 505>
  %strided.vec2 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122, i32 130, i32 138, i32 146, i32 154, i32 162, i32 170, i32 178, i32 186, i32 194, i32 202, i32 210, i32 218, i32 226, i32 234, i32 242, i32 250, i32 258, i32 266, i32 274, i32 282, i32 290, i32 298, i32 306, i32 314, i32 322, i32 330, i32 338, i32 346, i32 354, i32 362, i32 370, i32 378, i32 386, i32 394, i32 402, i32 410, i32 418, i32 426, i32 434, i32 442, i32 450, i32 458, i32 466, i32 474, i32 482, i32 490, i32 498, i32 506>
  %strided.vec3 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123, i32 131, i32 139, i32 147, i32 155, i32 163, i32 171, i32 179, i32 187, i32 195, i32 203, i32 211, i32 219, i32 227, i32 235, i32 243, i32 251, i32 259, i32 267, i32 275, i32 283, i32 291, i32 299, i32 307, i32 315, i32 323, i32 331, i32 339, i32 347, i32 355, i32 363, i32 371, i32 379, i32 387, i32 395, i32 403, i32 411, i32 419, i32 427, i32 435, i32 443, i32 451, i32 459, i32 467, i32 475, i32 483, i32 491, i32 499, i32 507>
  %strided.vec4 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124, i32 132, i32 140, i32 148, i32 156, i32 164, i32 172, i32 180, i32 188, i32 196, i32 204, i32 212, i32 220, i32 228, i32 236, i32 244, i32 252, i32 260, i32 268, i32 276, i32 284, i32 292, i32 300, i32 308, i32 316, i32 324, i32 332, i32 340, i32 348, i32 356, i32 364, i32 372, i32 380, i32 388, i32 396, i32 404, i32 412, i32 420, i32 428, i32 436, i32 444, i32 452, i32 460, i32 468, i32 476, i32 484, i32 492, i32 500, i32 508>
  %strided.vec5 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125, i32 133, i32 141, i32 149, i32 157, i32 165, i32 173, i32 181, i32 189, i32 197, i32 205, i32 213, i32 221, i32 229, i32 237, i32 245, i32 253, i32 261, i32 269, i32 277, i32 285, i32 293, i32 301, i32 309, i32 317, i32 325, i32 333, i32 341, i32 349, i32 357, i32 365, i32 373, i32 381, i32 389, i32 397, i32 405, i32 413, i32 421, i32 429, i32 437, i32 445, i32 453, i32 461, i32 469, i32 477, i32 485, i32 493, i32 501, i32 509>
  %strided.vec6 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126, i32 134, i32 142, i32 150, i32 158, i32 166, i32 174, i32 182, i32 190, i32 198, i32 206, i32 214, i32 222, i32 230, i32 238, i32 246, i32 254, i32 262, i32 270, i32 278, i32 286, i32 294, i32 302, i32 310, i32 318, i32 326, i32 334, i32 342, i32 350, i32 358, i32 366, i32 374, i32 382, i32 390, i32 398, i32 406, i32 414, i32 422, i32 430, i32 438, i32 446, i32 454, i32 462, i32 470, i32 478, i32 486, i32 494, i32 502, i32 510>
  %strided.vec7 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127, i32 135, i32 143, i32 151, i32 159, i32 167, i32 175, i32 183, i32 191, i32 199, i32 207, i32 215, i32 223, i32 231, i32 239, i32 247, i32 255, i32 263, i32 271, i32 279, i32 287, i32 295, i32 303, i32 311, i32 319, i32 327, i32 335, i32 343, i32 351, i32 359, i32 367, i32 375, i32 383, i32 391, i32 399, i32 407, i32 415, i32 423, i32 431, i32 439, i32 447, i32 455, i32 463, i32 471, i32 479, i32 487, i32 495, i32 503, i32 511>
  store <64 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <64 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <64 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <64 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <64 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <64 x i32> %strided.vec5, ptr %out.vec5, align 64
  store <64 x i32> %strided.vec6, ptr %out.vec6, align 64
  store <64 x i32> %strided.vec7, ptr %out.vec7, align 64
  ret void
}