; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.
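;
; As an illustration only (not part of the checked IR), a scalar loop of the
; following shape is the kind of source the LoopVectorizer turns into the
; wide-load + shufflevector sequences tested below; the struct and variable
; names here are hypothetical, and the seven i32 fields give the stride of 7:
;
;   struct S { int f0, f1, f2, f3, f4, f5, f6; };
;   void split(struct S *in, int *o0, int *o1, int *o2, int *o3,
;              int *o4, int *o5, int *o6, int n) {
;     for (int i = 0; i < n; ++i) {
;       o0[i] = in[i].f0; o1[i] = in[i].f1; o2[i] = in[i].f2;
;       o3[i] = in[i].f3; o4[i] = in[i].f4; o5[i] = in[i].f5;
;       o6[i] = in[i].f6;
;     }
;   }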

define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i32_stride7_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,2,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: movdqa 48(%rdi), %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
; SSE-NEXT: movq %xmm0, (%rsi)
; SSE-NEXT: movq %xmm4, (%rdx)
; SSE-NEXT: movq %xmm5, (%rcx)
; SSE-NEXT: movq %xmm6, (%r8)
; SSE-NEXT: movq %xmm1, (%r9)
; SSE-NEXT: movq %xmm3, (%r10)
; SSE-NEXT: movq %xmm7, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_stride7_vf2:
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vmovaps 32(%rdi), %ymm1
; AVX-NEXT: vmovaps (%rdi), %xmm2
; AVX-NEXT: vmovaps 16(%rdi), %xmm3
; AVX-NEXT: vmovaps 32(%rdi), %xmm4
; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm2[1],xmm4[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0,2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4]
; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX-NEXT: vmovlps %xmm5, (%rsi)
; AVX-NEXT: vmovlps %xmm6, (%rdx)
; AVX-NEXT: vmovlps %xmm7, (%rcx)
; AVX-NEXT: vmovlps %xmm2, (%r8)
; AVX-NEXT: vmovlps %xmm3, (%r9)
; AVX-NEXT: vmovlps %xmm4, (%r10)
; AVX-NEXT: vmovlps %xmm0, (%rax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i32_stride7_vf2:
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vbroadcastss 28(%rdi), %xmm2
; AVX2-NEXT: vmovaps (%rdi), %xmm3
; AVX2-NEXT: vmovaps 32(%rdi), %xmm4
; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm6
; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
; AVX2-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpermps %ymm7, %ymm4, %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovlps %xmm2, (%rsi)
; AVX2-NEXT: vmovlps %xmm5, (%rdx)
; AVX2-NEXT: vmovlps %xmm6, (%rcx)
; AVX2-NEXT: vmovlps %xmm3, (%r8)
; AVX2-NEXT: vmovlps %xmm4, (%r9)
; AVX2-NEXT: vmovlps %xmm7, (%r10)
; AVX2-NEXT: vmovlps %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i32_stride7_vf2:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FP-NEXT: vbroadcastss 28(%rdi), %xmm2
; AVX2-FP-NEXT: vmovaps (%rdi), %xmm3
; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4
; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm6
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm7, %ymm4, %ymm4
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi)
; AVX2-FP-NEXT: vmovlps %xmm5, (%rdx)
; AVX2-FP-NEXT: vmovlps %xmm6, (%rcx)
; AVX2-FP-NEXT: vmovlps %xmm3, (%r8)
; AVX2-FP-NEXT: vmovlps %xmm4, (%r9)
; AVX2-FP-NEXT: vmovlps %xmm7, (%r10)
; AVX2-FP-NEXT: vmovlps %xmm0, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i32_stride7_vf2:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FCP-NEXT: vbroadcastss 28(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm6
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm4
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
; AVX2-FCP-NEXT: vmovlps %xmm5, (%rdx)
; AVX2-FCP-NEXT: vmovlps %xmm6, (%rcx)
; AVX2-FCP-NEXT: vmovlps %xmm3, (%r8)
; AVX2-FCP-NEXT: vmovlps %xmm4, (%r9)
; AVX2-FCP-NEXT: vmovlps %xmm7, (%r10)
; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i32_stride7_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512-NEXT: vmovd %xmm1, %r11d
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX512-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512-NEXT: vpermps (%rdi), %zmm1, %zmm1
; AVX512-NEXT: vmovaps (%rdi), %ymm5
; AVX512-NEXT: vmovaps 32(%rdi), %ymm6
; AVX512-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
; AVX512-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
; AVX512-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX512-NEXT: vmovq %xmm2, (%rsi)
; AVX512-NEXT: vmovq %xmm3, (%rdx)
; AVX512-NEXT: vmovq %xmm4, (%rcx)
; AVX512-NEXT: vmovq %xmm0, (%r8)
; AVX512-NEXT: vmovlps %xmm1, (%r9)
; AVX512-NEXT: vmovlps %xmm7, (%r10)
; AVX512-NEXT: vmovlps %xmm5, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i32_stride7_vf2:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm0
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx)
; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx)
; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9)
; AVX512-FCP-NEXT: vmovq %xmm7, (%r10)
; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i32_stride7_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512DQ-NEXT: vmovd %xmm1, %r11d
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512DQ-NEXT: vpermps (%rdi), %zmm1, %zmm1
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm5
; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm6
; AVX512DQ-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
; AVX512DQ-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX512DQ-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
; AVX512DQ-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
; AVX512DQ-NEXT: vmovq %xmm0, (%r8)
; AVX512DQ-NEXT: vmovlps %xmm1, (%r9)
; AVX512DQ-NEXT: vmovlps %xmm7, (%r10)
; AVX512DQ-NEXT: vmovlps %xmm5, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride7_vf2:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx)
; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx)
; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9)
; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10)
; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i32_stride7_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512BW-NEXT: vmovd %xmm1, %r11d
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512BW-NEXT: vpermps (%rdi), %zmm1, %zmm1
; AVX512BW-NEXT: vmovaps (%rdi), %ymm5
; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm6
; AVX512BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
; AVX512BW-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX512BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
; AVX512BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
; AVX512BW-NEXT: vmovq %xmm0, (%r8)
; AVX512BW-NEXT: vmovlps %xmm1, (%r9)
; AVX512BW-NEXT: vmovlps %xmm7, (%r10)
; AVX512BW-NEXT: vmovlps %xmm5, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i32_stride7_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
; AVX512BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi)
; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx)
; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx)
; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9)
; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10)
; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i32_stride7_vf2:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512DQ-BW-NEXT: vmovd %xmm1, %r11d
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm1, %zmm1
; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm5
; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm6
; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r9)
; AVX512DQ-BW-NEXT: vmovlps %xmm7, (%r10)
; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
; AVX512DQ-BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <14 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 0, i32 7>
  %strided.vec1 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 1, i32 8>
  %strided.vec2 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 2, i32 9>
  %strided.vec3 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 3, i32 10>
  %strided.vec4 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 4, i32 11>
  %strided.vec5 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 5, i32 12>
  %strided.vec6 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 6, i32 13>
  store <2 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <2 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <2 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <2 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <2 x i32> %strided.vec5, ptr %out.vec5, align 64
  store <2 x i32> %strided.vec6, ptr %out.vec6, align 64
  ret void
}

define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i32_stride7_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movdqa 96(%rdi), %xmm1
; SSE-NEXT: movdqa 64(%rdi), %xmm0
; SSE-NEXT: movdqa 80(%rdi), %xmm2
; SSE-NEXT: movdqa (%rdi), %xmm11
; SSE-NEXT: movdqa 16(%rdi), %xmm3
; SSE-NEXT: movdqa 32(%rdi), %xmm4
; SSE-NEXT: movdqa 48(%rdi), %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[3,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[2,2,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm11[0],xmm5[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,2,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,1,1]
; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm11[2],xmm6[3],xmm11[3]
; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1]
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,2,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,2,2]
; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1]
; SSE-NEXT: movapd %xmm5, (%rsi)
; SSE-NEXT: movapd %xmm6, (%rdx)
; SSE-NEXT: movapd %xmm11, (%rcx)
; SSE-NEXT: movapd %xmm8, (%r8)
; SSE-NEXT: movapd %xmm4, (%r9)
; SSE-NEXT: movapd %xmm0, (%rdi)
; SSE-NEXT: movapd %xmm2, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_stride7_vf4:
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: vmovaps 32(%rdi), %ymm0
; AVX-NEXT: vmovaps (%rdi), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX-NEXT: vmovaps (%rdi), %xmm3
; AVX-NEXT: vmovaps 32(%rdi), %xmm4
; AVX-NEXT: vmovaps 64(%rdi), %xmm5
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX-NEXT: vmovaps 80(%rdi), %xmm6
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,1]
; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0],mem[3,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[2]
; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm5[0,1,0,1]
; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3]
; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm3[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
; AVX-NEXT: vmovaps 96(%rdi), %xmm9
; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0],xmm5[1],xmm9[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,1,0]
; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,1,0,1]
; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3]
; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3]
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4]
; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,0],xmm5[3,2]
; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,0,1]
; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3]
; AVX-NEXT: vmovaps %xmm2, (%rsi)
; AVX-NEXT: vmovaps %xmm7, (%rdx)
; AVX-NEXT: vmovaps %xmm8, (%rcx)
; AVX-NEXT: vmovaps %xmm3, (%r8)
; AVX-NEXT: vmovaps %xmm4, (%r9)
; AVX-NEXT: vmovaps %xmm5, (%r10)
; AVX-NEXT: vmovaps %xmm0, (%rax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i32_stride7_vf4:
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [0,7,6,u]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vbroadcastss 84(%rdi), %xmm3
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
; AVX2-NEXT: vmovaps 80(%rdi), %xmm4
; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3]
; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm6
; AVX2-NEXT: vmovaps 32(%rdi), %xmm7
; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3]
; AVX2-NEXT: vmovaps 64(%rdi), %xmm8
; AVX2-NEXT: vbroadcastss %xmm8, %xmm9
; AVX2-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1,2],mem[3]
; AVX2-NEXT: vmovaps 96(%rdi), %xmm7
; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3]
; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
; AVX2-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
; AVX2-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0]
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7]
; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3]
; AVX2-NEXT: vbroadcastss 80(%rdi), %ymm8
; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
; AVX2-NEXT: vmovaps %xmm2, (%rsi)
; AVX2-NEXT: vmovaps %xmm3, (%rdx)
; AVX2-NEXT: vmovaps %xmm4, (%rcx)
; AVX2-NEXT: vmovaps %xmm6, (%r8)
; AVX2-NEXT: vmovaps %xmm9, (%r9)
; AVX2-NEXT: vmovaps %xmm5, (%r10)
; AVX2-NEXT: vmovaps %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i32_stride7_vf4:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm2 = [0,7,6,u]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm2
; AVX2-FP-NEXT: vbroadcastss 84(%rdi), %xmm3
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm4
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3]
; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm6
; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm7
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3]
; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm8
; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm9
; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1,2],mem[3]
; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm7
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
; AVX2-FP-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3]
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7]
; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3]
; AVX2-FP-NEXT: vbroadcastss 80(%rdi), %ymm8
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
; AVX2-FP-NEXT: vmovaps %xmm2, (%rsi)
; AVX2-FP-NEXT: vmovaps %xmm3, (%rdx)
; AVX2-FP-NEXT: vmovaps %xmm4, (%rcx)
; AVX2-FP-NEXT: vmovaps %xmm6, (%r8)
; AVX2-FP-NEXT: vmovaps %xmm9, (%r9)
; AVX2-FP-NEXT: vmovaps %xmm5, (%r10)
; AVX2-FP-NEXT: vmovaps %xmm0, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i32_stride7_vf4:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [0,7,6,u]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm2
; AVX2-FCP-NEXT: vbroadcastss 84(%rdi), %xmm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,7,0,1,0,7,0]
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm5
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3]
; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm6
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm7
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3]
; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm8
; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm9
; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1,2],mem[3]
; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm7
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
; AVX2-FCP-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3]
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,2,3,5,4,6,7]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3]
; AVX2-FCP-NEXT: vbroadcastss 80(%rdi), %ymm8
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsi)
; AVX2-FCP-NEXT: vmovaps %xmm3, (%rdx)
; AVX2-FCP-NEXT: vmovaps %xmm5, (%rcx)
; AVX2-FCP-NEXT: vmovaps %xmm6, (%r8)
; AVX2-FCP-NEXT: vmovaps %xmm9, (%r9)
; AVX2-FCP-NEXT: vmovaps %xmm4, (%r10)
; AVX2-FCP-NEXT: vmovaps %xmm0, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i32_stride7_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512-NEXT: vmovdqa %xmm3, (%rdx)
; AVX512-NEXT: vmovdqa %xmm4, (%rcx)
; AVX512-NEXT: vmovdqa %xmm5, (%r8)
; AVX512-NEXT: vmovdqa %xmm6, (%r9)
; AVX512-NEXT: vmovdqa %xmm7, (%r10)
; AVX512-NEXT: vmovdqa %xmm8, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i32_stride7_vf4:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx)
; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r8)
; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r9)
; AVX512-FCP-NEXT: vmovdqa %xmm7, (%r10)
; AVX512-FCP-NEXT: vmovdqa %xmm8, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i32_stride7_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx)
; AVX512DQ-NEXT: vmovdqa %xmm5, (%r8)
; AVX512DQ-NEXT: vmovdqa %xmm6, (%r9)
; AVX512DQ-NEXT: vmovdqa %xmm7, (%r10)
; AVX512DQ-NEXT: vmovdqa %xmm8, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride7_vf4:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r9)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%r10)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i32_stride7_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
; AVX512BW-NEXT: vmovdqa %xmm5, (%r8)
; AVX512BW-NEXT: vmovdqa %xmm6, (%r9)
; AVX512BW-NEXT: vmovdqa %xmm7, (%r10)
; AVX512BW-NEXT: vmovdqa %xmm8, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i32_stride7_vf4:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%r10)
; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i32_stride7_vf4:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9)
; AVX512DQ-BW-NEXT: vmovdqa %xmm7, (%r10)
; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf4:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%r10)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <28 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
  %strided.vec1 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
  %strided.vec2 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 2, i32 9, i32 16, i32 23>
  %strided.vec3 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 3, i32 10, i32 17, i32 24>
  %strided.vec4 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 4, i32 11, i32 18, i32 25>
  %strided.vec5 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 5, i32 12, i32 19, i32 26>
  %strided.vec6 = shufflevector <28 x i32> %wide.vec, <28 x i32> poison, <4 x i32> <i32 6, i32 13, i32 20, i32 27>
  store <4 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <4 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <4 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <4 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <4 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <4 x i32> %strided.vec5, ptr %out.vec5, align 64
  store <4 x i32> %strided.vec6, ptr %out.vec6, align 64
  ret void
}

define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i32_stride7_vf8:
; SSE: # %bb.0:
; SSE-NEXT: subq $24, %rsp
; SSE-NEXT: movdqa 144(%rdi), %xmm9
; SSE-NEXT: movdqa 80(%rdi), %xmm5
; SSE-NEXT: movdqa (%rdi), %xmm12
; SSE-NEXT: movdqa 16(%rdi), %xmm11
; SSE-NEXT: movdqa 48(%rdi), %xmm6
; SSE-NEXT: movdqa 192(%rdi), %xmm8
; SSE-NEXT: movdqa 160(%rdi), %xmm10
; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 112(%rdi), %xmm15
; SSE-NEXT: movdqa 128(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3]
; SSE-NEXT: movdqa %xmm15, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,1,1]
; SSE-NEXT: movdqa %xmm12, %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2]
; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm10, %xmm4
; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm6, %xmm1
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: movdqa 32(%rdi), %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: movdqa 176(%rdi), %xmm10
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: movdqa 64(%rdi), %xmm14
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
; SSE-NEXT: movdqa 208(%rdi), %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1]
; SSE-NEXT: movdqa %xmm10, %xmm7
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3]
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1]
; SSE-NEXT: movdqa 96(%rdi), %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1]
; SSE-NEXT: movdqa %xmm14, %xmm15
; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: movdqa %xmm6, %xmm12
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,2,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1]
; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm12[0],xmm9[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
; SSE-NEXT: movdqa %xmm11, %xmm12
; SSE-NEXT: movdqa %xmm11, %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1]
; SSE-NEXT: movdqa %xmm6, %xmm11
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm4[0],xmm10[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2]
; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1]
; SSE-NEXT: movdqa %xmm12, %xmm6
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm12[0],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
16-byte Reload 1134; SSE-NEXT: movaps %xmm1, (%rsi) 1135; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload 1136; SSE-NEXT: movaps %xmm1, 16(%rsi) 1137; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1138; SSE-NEXT: movaps %xmm0, (%rdx) 1139; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1140; SSE-NEXT: movaps %xmm0, 16(%rdx) 1141; SSE-NEXT: movapd %xmm8, (%rcx) 1142; SSE-NEXT: movapd %xmm13, 16(%rcx) 1143; SSE-NEXT: movapd %xmm15, (%r8) 1144; SSE-NEXT: movapd %xmm7, 16(%r8) 1145; SSE-NEXT: movapd %xmm2, (%r9) 1146; SSE-NEXT: movapd %xmm9, 16(%r9) 1147; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1148; SSE-NEXT: movapd %xmm14, (%rax) 1149; SSE-NEXT: movapd %xmm10, 16(%rax) 1150; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1151; SSE-NEXT: movapd %xmm3, (%rax) 1152; SSE-NEXT: movapd %xmm4, 16(%rax) 1153; SSE-NEXT: addq $24, %rsp 1154; SSE-NEXT: retq 1155; 1156; AVX-LABEL: load_i32_stride7_vf8: 1157; AVX: # %bb.0: 1158; AVX-NEXT: vmovaps 160(%rdi), %ymm4 1159; AVX-NEXT: vmovaps 128(%rdi), %ymm7 1160; AVX-NEXT: vmovaps 64(%rdi), %ymm10 1161; AVX-NEXT: vmovaps 32(%rdi), %ymm0 1162; AVX-NEXT: vmovaps (%rdi), %ymm1 1163; AVX-NEXT: vmovaps 96(%rdi), %ymm12 1164; AVX-NEXT: vmovaps 80(%rdi), %xmm2 1165; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] 1166; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] 1167; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5 1168; AVX-NEXT: vmovaps (%rdi), %xmm14 1169; AVX-NEXT: vmovaps 32(%rdi), %xmm9 1170; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] 1171; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,3,2,3] 1172; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2],ymm3[3,4,5,6,7] 1173; AVX-NEXT: vmovaps 160(%rdi), %xmm3 1174; AVX-NEXT: vmovaps 128(%rdi), %xmm5 1175; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm5[1],xmm3[1] 1176; AVX-NEXT: vmovaps 192(%rdi), %xmm11 1177; AVX-NEXT: vinsertps {{.*#+}} xmm8 = zero,xmm8[1,2],xmm11[1] 1178; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 1179; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7] 1180; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm12[1,1],ymm10[2,2],ymm12[5,5],ymm10[6,6] 1181; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3] 1182; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm9[0],xmm14[1],xmm9[2,3] 1183; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,0],mem[3,3] 1184; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4,5,6,7] 1185; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm7[2,3],ymm4[0,1] 1186; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm7[0,0],ymm13[3,3],ymm7[4,4],ymm13[7,7] 1187; AVX-NEXT: vextractf128 $1, %ymm13, %xmm13 1188; AVX-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm13[1,2],xmm11[2] 1189; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 1190; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7] 1191; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm14[2,3,2,3] 1192; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm9[1],xmm13[2,3] 1193; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm2[3,1],ymm10[0,3],ymm2[7,5],ymm10[4,7] 1194; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4] 1195; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1],ymm15[2,3,4,5,6,7] 1196; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] 1197; AVX-NEXT: vextractf128 $1, %ymm13, %xmm13 1198; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] 1199; AVX-NEXT: vmovaps 192(%rdi), %ymm13 1200; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 1201; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4],ymm11[5,6,7] 1202; AVX-NEXT: vmovaps 64(%rdi), %xmm15 
1203; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm12[0,0],ymm10[5,4],ymm12[4,4] 1204; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm12[3,1],ymm10[0,2],ymm12[7,5],ymm10[4,6] 1205; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0,1,2],xmm14[3] 1206; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] 1207; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] 1208; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm13[0,1],ymm4[1,3],ymm13[4,5],ymm4[5,7] 1209; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm7[0,2],ymm12[2,0],ymm7[4,6],ymm12[6,4] 1210; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] 1211; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,0],ymm4[2,0],ymm13[5,4],ymm4[6,4] 1212; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm7[2,3,0,1] 1213; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm14[0,0],ymm7[7,4],ymm14[4,4] 1214; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm12[2,0],ymm7[6,4],ymm12[6,4] 1215; AVX-NEXT: vmovaps 96(%rdi), %xmm12 1216; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm12[0,1,0,1] 1217; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] 1218; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] 1219; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] 1220; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3] 1221; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] 1222; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[2,1],ymm4[3,3],ymm13[6,5],ymm4[7,7] 1223; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm3[0],xmm5[1],xmm3[2,3] 1224; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 1225; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm4[2,0],ymm9[5,4],ymm4[6,4] 1226; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1,2],xmm15[3] 1227; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4] 1228; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 1229; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm14[2,0],xmm9[3,2] 1230; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] 1231; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] 1232; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[3] 1233; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] 1234; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1235; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] 1236; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3,0,1] 1237; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,0],ymm1[0,0],ymm13[7,4],ymm1[4,4] 1238; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] 1239; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] 1240; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 1241; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] 1242; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 1243; AVX-NEXT: vmovaps %ymm6, (%rsi) 1244; AVX-NEXT: vmovaps %ymm8, (%rdx) 1245; AVX-NEXT: vmovaps %ymm11, (%rcx) 1246; AVX-NEXT: vmovaps %ymm10, (%r8) 1247; AVX-NEXT: vmovaps %ymm7, (%r9) 1248; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 1249; AVX-NEXT: vmovaps %ymm4, (%rax) 1250; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 1251; AVX-NEXT: vmovaps %ymm0, (%rax) 1252; AVX-NEXT: vzeroupper 1253; AVX-NEXT: retq 1254; 1255; AVX2-LABEL: load_i32_stride7_vf8: 1256; AVX2: # %bb.0: 1257; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 1258; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 1259; AVX2-NEXT: vmovdqa 64(%rdi), %ymm9 1260; AVX2-NEXT: vmovdqa 160(%rdi), %ymm4 1261; AVX2-NEXT: vmovdqa 128(%rdi), %ymm5 1262; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1263; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 1264; AVX2-NEXT: vmovdqa 96(%rdi), %ymm10 1265; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm2 1266; AVX2-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] 1267; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,6,0] 1268; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] 1269; AVX2-NEXT: vpermd %ymm6, %ymm3, %ymm3 1270; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] 1271; AVX2-NEXT: vmovdqa 128(%rdi), %xmm6 1272; AVX2-NEXT: vmovdqa 160(%rdi), %xmm3 1273; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm3[1] 1274; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 1275; AVX2-NEXT: vpbroadcastd 196(%rdi), %ymm7 1276; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] 1277; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] 1278; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = mem[2,2,2,2] 1279; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 1280; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm4[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] 1281; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0] 1282; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] 1283; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] 1284; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] 1285; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7] 1286; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,0,7,6,5,0,0,0] 1287; AVX2-NEXT: vpermd %ymm7, %ymm11, %ymm7 1288; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] 1289; AVX2-NEXT: vmovdqa 80(%rdi), %xmm7 1290; AVX2-NEXT: vpalignr {{.*#+}} ymm11 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] 1291; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3],ymm11[4,5,6,7] 1292; AVX2-NEXT: vpbroadcastd 8(%rdi), %xmm11 1293; AVX2-NEXT: vmovdqa 32(%rdi), %xmm12 1294; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3] 1295; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] 1296; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] 1297; AVX2-NEXT: vpbroadcastd 204(%rdi), %ymm13 1298; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] 1299; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7] 1300; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],mem[3] 1301; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,2,2,3] 1302; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7] 1303; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,1,0,7,5,5,4] 1304; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5,6,7] 1305; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,2],ymm4[1,3],ymm5[4,6],ymm4[5,7] 1306; AVX2-NEXT: vbroadcastss 208(%rdi), %ymm11 1307; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] 1308; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7] 1309; AVX2-NEXT: vpbroadcastd 100(%rdi), %xmm10 1310; AVX2-NEXT: vmovdqa 64(%rdi), %xmm11 1311; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] 1312; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,3,0,0] 1313; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] 1314; AVX2-NEXT: vpermd %ymm13, %ymm12, %ymm12 1315; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] 1316; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm12 = [0,7,0,7,0,7,0,7] 1317; AVX2-NEXT: vpermd %ymm5, %ymm12, %ymm13 1318; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm4[6,7] 1319; AVX2-NEXT: vpbroadcastd 212(%rdi), %ymm14 1320; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = 
ymm13[0,1,2,3,4,5,6],ymm14[7] 1321; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] 1322; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] 1323; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] 1324; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] 1325; AVX2-NEXT: vpbroadcastd 216(%rdi), %ymm5 1326; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] 1327; AVX2-NEXT: vmovdqa 96(%rdi), %xmm5 1328; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3] 1329; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] 1330; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,0,2,3,5,4,6,7] 1331; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8 1332; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] 1333; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 1334; AVX2-NEXT: vpermd 192(%rdi), %ymm12, %ymm8 1335; AVX2-NEXT: vpbroadcastd 136(%rdi), %xmm11 1336; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3] 1337; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 1338; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] 1339; AVX2-NEXT: vpbroadcastd 80(%rdi), %ymm8 1340; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] 1341; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 1342; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] 1343; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1344; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] 1345; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 1346; AVX2-NEXT: vmovdqa %ymm2, (%rsi) 1347; AVX2-NEXT: vmovdqa %ymm6, (%rdx) 1348; AVX2-NEXT: vmovdqa %ymm7, (%rcx) 1349; AVX2-NEXT: vmovdqa %ymm9, (%r8) 1350; AVX2-NEXT: vmovdqa %ymm10, (%r9) 1351; AVX2-NEXT: vmovdqa %ymm4, (%r10) 1352; AVX2-NEXT: vmovdqa %ymm0, (%rax) 1353; AVX2-NEXT: vzeroupper 1354; AVX2-NEXT: retq 1355; 1356; AVX2-FP-LABEL: load_i32_stride7_vf8: 1357; AVX2-FP: # %bb.0: 1358; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1359; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1360; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm9 1361; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm4 1362; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm5 1363; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 1364; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 1365; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm10 1366; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm2 1367; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] 1368; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,6,0] 1369; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] 1370; AVX2-FP-NEXT: vpermd %ymm6, %ymm3, %ymm3 1371; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] 1372; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm6 1373; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm3 1374; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm3[1] 1375; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 1376; AVX2-FP-NEXT: vpbroadcastd 196(%rdi), %ymm7 1377; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] 1378; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] 1379; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = mem[2,2,2,2] 1380; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 1381; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm4[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] 1382; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0] 1383; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] 1384; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] 1385; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] 1386; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7] 1387; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,0,7,6,5,0,0,0] 1388; AVX2-FP-NEXT: vpermd %ymm7, %ymm11, %ymm7 1389; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] 1390; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm7 1391; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm11 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] 1392; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3],ymm11[4,5,6,7] 1393; AVX2-FP-NEXT: vpbroadcastd 8(%rdi), %xmm11 1394; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm12 1395; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3] 1396; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] 1397; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] 1398; AVX2-FP-NEXT: vpbroadcastd 204(%rdi), %ymm13 1399; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] 1400; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7] 1401; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],mem[3] 1402; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,2,2,3] 1403; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7] 1404; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,1,0,7,5,5,4] 1405; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5,6,7] 1406; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,2],ymm4[1,3],ymm5[4,6],ymm4[5,7] 1407; AVX2-FP-NEXT: vbroadcastss 208(%rdi), %ymm11 1408; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] 1409; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7] 1410; AVX2-FP-NEXT: vpbroadcastd 100(%rdi), %xmm10 1411; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm11 1412; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] 1413; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,3,0,0] 1414; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] 1415; AVX2-FP-NEXT: vpermd %ymm13, %ymm12, %ymm12 1416; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] 1417; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [0,7,0,7,0,7,0,7] 1418; AVX2-FP-NEXT: vpermd %ymm5, %ymm12, %ymm13 1419; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm4[6,7] 1420; AVX2-FP-NEXT: vpbroadcastd 212(%rdi), %ymm14 1421; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] 1422; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] 1423; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] 1424; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] 1425; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] 1426; AVX2-FP-NEXT: vpbroadcastd 216(%rdi), %ymm5 1427; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] 1428; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm5 1429; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3] 1430; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] 1431; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,0,2,3,5,4,6,7] 1432; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm8 1433; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] 1434; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 1435; AVX2-FP-NEXT: vpermd 192(%rdi), %ymm12, %ymm8 1436; AVX2-FP-NEXT: vpbroadcastd 136(%rdi), %xmm11 1437; AVX2-FP-NEXT: vpblendd {{.*#+}} 
xmm3 = xmm11[0],xmm3[1],xmm11[2,3] 1438; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 1439; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] 1440; AVX2-FP-NEXT: vpbroadcastd 80(%rdi), %ymm8 1441; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] 1442; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 1443; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] 1444; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 1445; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] 1446; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 1447; AVX2-FP-NEXT: vmovdqa %ymm2, (%rsi) 1448; AVX2-FP-NEXT: vmovdqa %ymm6, (%rdx) 1449; AVX2-FP-NEXT: vmovdqa %ymm7, (%rcx) 1450; AVX2-FP-NEXT: vmovdqa %ymm9, (%r8) 1451; AVX2-FP-NEXT: vmovdqa %ymm10, (%r9) 1452; AVX2-FP-NEXT: vmovdqa %ymm4, (%r10) 1453; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax) 1454; AVX2-FP-NEXT: vzeroupper 1455; AVX2-FP-NEXT: retq 1456; 1457; AVX2-FCP-LABEL: load_i32_stride7_vf8: 1458; AVX2-FCP: # %bb.0: 1459; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1460; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1461; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm9 1462; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 1463; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 1464; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 1465; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1466; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm10 1467; AVX2-FCP-NEXT: vpbroadcastq 80(%rdi), %ymm2 1468; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] 1469; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,6,0] 1470; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] 1471; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3 1472; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] 1473; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm6 1474; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm3 1475; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm3[1] 1476; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 1477; AVX2-FCP-NEXT: vpbroadcastd 196(%rdi), %ymm7 1478; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] 1479; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] 1480; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = mem[2,2,2,2] 1481; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 1482; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm4[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] 1483; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0] 1484; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] 1485; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] 1486; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] 1487; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7] 1488; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,0,7,6,5,0,0,0] 1489; AVX2-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm7 1490; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] 1491; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm7 1492; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] 1493; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3],ymm11[4,5,6,7] 1494; AVX2-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm11 1495; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 1496; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3] 1497; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] 1498; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] 1499; AVX2-FCP-NEXT: vpbroadcastd 204(%rdi), %ymm13 1500; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] 1501; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7] 1502; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],mem[3] 1503; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,2,2,3] 1504; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7] 1505; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,1,0,7,5,5,4] 1506; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5,6,7] 1507; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,2],ymm4[1,3],ymm5[4,6],ymm4[5,7] 1508; AVX2-FCP-NEXT: vbroadcastss 208(%rdi), %ymm11 1509; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] 1510; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7] 1511; AVX2-FCP-NEXT: vpbroadcastd 100(%rdi), %xmm10 1512; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm11 1513; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] 1514; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,3,0,0] 1515; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] 1516; AVX2-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 1517; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] 1518; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [0,7,0,7,0,7,0,7] 1519; AVX2-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm13 1520; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm4[6,7] 1521; AVX2-FCP-NEXT: vpbroadcastd 212(%rdi), %ymm14 1522; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] 1523; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] 1524; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] 1525; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,1,7] 1526; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 1527; AVX2-FCP-NEXT: vpbroadcastd 216(%rdi), %ymm5 1528; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] 1529; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 1530; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3] 1531; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] 1532; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,0,2,3,5,4,6,7] 1533; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 1534; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] 1535; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 1536; AVX2-FCP-NEXT: vpermd 192(%rdi), %ymm12, %ymm8 1537; AVX2-FCP-NEXT: vpbroadcastd 136(%rdi), %xmm11 1538; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3] 1539; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 1540; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] 1541; AVX2-FCP-NEXT: vpbroadcastd 80(%rdi), %ymm8 1542; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] 1543; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 1544; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] 1545; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 1546; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] 1547; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 1548; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rsi) 1549; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rdx) 1550; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rcx) 1551; AVX2-FCP-NEXT: vmovdqa %ymm9, (%r8) 1552; AVX2-FCP-NEXT: vmovdqa %ymm10, (%r9) 1553; 
AVX2-FCP-NEXT: vmovdqa %ymm4, (%r10) 1554; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) 1555; AVX2-FCP-NEXT: vzeroupper 1556; AVX2-FCP-NEXT: retq 1557; 1558; AVX512-LABEL: load_i32_stride7_vf8: 1559; AVX512: # %bb.0: 1560; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 1561; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 1562; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 1563; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 1564; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 1565; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 1566; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] 1567; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1568; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] 1569; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1570; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] 1571; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] 1572; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 1573; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] 1574; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1575; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] 1576; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] 1577; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1578; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 1579; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 1580; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] 1581; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] 1582; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 1583; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] 1584; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 1585; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] 1586; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] 1587; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 1588; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] 1589; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 1590; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1591; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] 1592; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1593; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] 1594; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1595; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1596; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] 1597; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1598; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] 1599; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 1600; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] 1601; AVX512-NEXT: vmovdqa %ymm4, (%rsi) 1602; AVX512-NEXT: vmovdqa %ymm5, (%rdx) 1603; AVX512-NEXT: vmovdqa %ymm6, (%rcx) 1604; AVX512-NEXT: vmovdqa %ymm7, (%r8) 1605; AVX512-NEXT: vmovdqa %ymm8, (%r9) 1606; AVX512-NEXT: vmovdqa %ymm9, (%r10) 1607; AVX512-NEXT: vmovdqa %ymm0, (%rax) 1608; AVX512-NEXT: vzeroupper 1609; AVX512-NEXT: retq 1610; 1611; AVX512-FCP-LABEL: load_i32_stride7_vf8: 1612; AVX512-FCP: # %bb.0: 1613; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1614; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1615; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1616; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1617; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 1618; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 1619; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] 1620; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1621; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] 1622; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1623; AVX512-FCP-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] 1624; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] 1625; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 1626; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] 1627; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1628; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] 1629; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] 1630; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1631; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 1632; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 1633; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] 1634; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] 1635; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 1636; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] 1637; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 1638; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] 1639; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] 1640; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 1641; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] 1642; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 1643; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1644; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] 1645; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1646; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] 1647; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1648; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1649; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] 1650; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1651; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] 1652; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 1653; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] 1654; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rsi) 1655; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rdx) 1656; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rcx) 1657; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r8) 1658; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r9) 1659; AVX512-FCP-NEXT: vmovdqa %ymm9, (%r10) 1660; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) 1661; AVX512-FCP-NEXT: vzeroupper 1662; AVX512-FCP-NEXT: retq 1663; 1664; AVX512DQ-LABEL: load_i32_stride7_vf8: 1665; AVX512DQ: # %bb.0: 1666; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 1667; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 1668; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 1669; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 1670; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 1671; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 1672; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] 1673; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1674; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] 1675; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1676; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] 1677; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] 1678; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 1679; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] 1680; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1681; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] 1682; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] 1683; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1684; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 1685; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 1686; AVX512DQ-NEXT: 
vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] 1687; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] 1688; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 1689; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] 1690; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 1691; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] 1692; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] 1693; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 1694; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] 1695; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 1696; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1697; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] 1698; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1699; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] 1700; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1701; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1702; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] 1703; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1704; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] 1705; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 1706; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] 1707; AVX512DQ-NEXT: vmovdqa %ymm4, (%rsi) 1708; AVX512DQ-NEXT: vmovdqa %ymm5, (%rdx) 1709; AVX512DQ-NEXT: vmovdqa %ymm6, (%rcx) 1710; AVX512DQ-NEXT: vmovdqa %ymm7, (%r8) 1711; AVX512DQ-NEXT: vmovdqa %ymm8, (%r9) 1712; AVX512DQ-NEXT: vmovdqa %ymm9, (%r10) 1713; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax) 1714; AVX512DQ-NEXT: vzeroupper 1715; AVX512DQ-NEXT: retq 1716; 1717; AVX512DQ-FCP-LABEL: load_i32_stride7_vf8: 1718; AVX512DQ-FCP: # %bb.0: 1719; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1720; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1721; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1722; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1723; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 1724; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 1725; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] 1726; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1727; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] 1728; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1729; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] 1730; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] 1731; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 1732; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] 1733; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1734; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] 1735; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] 1736; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1737; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 1738; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 1739; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] 1740; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] 1741; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 1742; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] 1743; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 1744; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] 1745; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] 1746; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 1747; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] 1748; AVX512DQ-FCP-NEXT: 
vpermi2d %zmm1, %zmm0, %zmm9 1749; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1750; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] 1751; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1752; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] 1753; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1754; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1755; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] 1756; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1757; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] 1758; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 1759; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] 1760; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rsi) 1761; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rdx) 1762; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rcx) 1763; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r8) 1764; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r9) 1765; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%r10) 1766; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) 1767; AVX512DQ-FCP-NEXT: vzeroupper 1768; AVX512DQ-FCP-NEXT: retq 1769; 1770; AVX512BW-LABEL: load_i32_stride7_vf8: 1771; AVX512BW: # %bb.0: 1772; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1773; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 1774; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1775; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 1776; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 1777; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 1778; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] 1779; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1780; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] 1781; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1782; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] 1783; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] 1784; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 1785; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] 1786; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1787; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] 1788; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] 1789; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1790; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 1791; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 1792; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] 1793; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] 1794; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 1795; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] 1796; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 1797; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] 1798; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] 1799; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 1800; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] 1801; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 1802; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1803; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] 1804; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1805; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] 1806; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1807; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1808; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] 1809; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1810; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] 1811; AVX512BW-NEXT: 
vpermi2d %zmm1, %zmm0, %zmm2 1812; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] 1813; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) 1814; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx) 1815; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx) 1816; AVX512BW-NEXT: vmovdqa %ymm7, (%r8) 1817; AVX512BW-NEXT: vmovdqa %ymm8, (%r9) 1818; AVX512BW-NEXT: vmovdqa %ymm9, (%r10) 1819; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) 1820; AVX512BW-NEXT: vzeroupper 1821; AVX512BW-NEXT: retq 1822; 1823; AVX512BW-FCP-LABEL: load_i32_stride7_vf8: 1824; AVX512BW-FCP: # %bb.0: 1825; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1826; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1827; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1828; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1829; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 1830; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 1831; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] 1832; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1833; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] 1834; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1835; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] 1836; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] 1837; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 1838; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] 1839; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1840; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] 1841; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] 1842; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1843; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 1844; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 1845; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] 1846; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] 1847; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 1848; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] 1849; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 1850; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] 1851; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] 1852; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 1853; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] 1854; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 1855; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1856; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] 1857; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1858; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] 1859; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1860; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1861; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] 1862; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1863; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] 1864; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 1865; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] 1866; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) 1867; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rdx) 1868; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) 1869; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8) 1870; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9) 1871; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r10) 1872; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) 1873; AVX512BW-FCP-NEXT: vzeroupper 1874; AVX512BW-FCP-NEXT: retq 1875; 1876; 
AVX512DQ-BW-LABEL: load_i32_stride7_vf8: 1877; AVX512DQ-BW: # %bb.0: 1878; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1879; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 1880; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 1881; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 1882; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 1883; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 1884; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] 1885; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1886; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] 1887; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 1888; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] 1889; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] 1890; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 1891; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] 1892; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 1893; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] 1894; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] 1895; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1896; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 1897; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 1898; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] 1899; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] 1900; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 1901; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] 1902; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 1903; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] 1904; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] 1905; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 1906; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] 1907; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 1908; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1909; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] 1910; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 1911; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] 1912; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 1913; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 1914; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] 1915; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 1916; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] 1917; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 1918; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] 1919; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi) 1920; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rdx) 1921; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rcx) 1922; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r8) 1923; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r9) 1924; AVX512DQ-BW-NEXT: vmovdqa %ymm9, (%r10) 1925; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) 1926; AVX512DQ-BW-NEXT: vzeroupper 1927; AVX512DQ-BW-NEXT: retq 1928; 1929; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf8: 1930; AVX512DQ-BW-FCP: # %bb.0: 1931; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1932; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1933; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1934; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1935; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 1936; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 1937; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] 1938; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%r10)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <56 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49>
  %strided.vec1 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50>
  %strided.vec2 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51>
  %strided.vec3 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52>
  %strided.vec4 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53>
  %strided.vec5 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54>
  %strided.vec6 = shufflevector <56 x i32> %wide.vec, <56 x i32> poison, <8 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55>
  store <8 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <8 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <8 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <8 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <8 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <8 x i32> %strided.vec5, ptr %out.vec5, align 64
  store <8 x i32> %strided.vec6, ptr %out.vec6, align 64
  ret void
}

define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i32_stride7_vf16:
; SSE: # %bb.0:
; SSE-NEXT: subq $440, %rsp # imm = 0x1B8
; SSE-NEXT: movdqa 304(%rdi), %xmm3
; SSE-NEXT: movdqa 272(%rdi), %xmm5
; SSE-NEXT: movdqa 224(%rdi), %xmm15
; SSE-NEXT: movdqa 240(%rdi), %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 80(%rdi), %xmm7
; SSE-NEXT: movdqa (%rdi), %xmm2
; SSE-NEXT: movdqa 16(%rdi), %xmm8
; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdi), %xmm9
; SSE-NEXT: movdqa 192(%rdi), %xmm14
; SSE-NEXT: movdqa 160(%rdi), %xmm12
; SSE-NEXT: movdqa 112(%rdi), %xmm4
; SSE-NEXT: movdqa 128(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT: movdqa %xmm4, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3]
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
; SSE-NEXT: movdqa %xmm9, %xmm11
; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
; SSE-NEXT: movdqa %xmm15, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
; SSE-NEXT: movdqa %xmm5, %xmm9
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 336(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 352(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa
416(%rdi), %xmm8
; SSE-NEXT: movdqa 384(%rdi), %xmm13
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3]
; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm12, %xmm1
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
; SSE-NEXT: movdqa %xmm4, %xmm5
; SSE-NEXT: movdqa 144(%rdi), %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2]
; SSE-NEXT: movdqa %xmm7, %xmm12
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm11, %xmm1
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; SSE-NEXT: movdqa %xmm2, %xmm10
; SSE-NEXT: movdqa 32(%rdi), %xmm7
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
; SSE-NEXT: movdqa %xmm9, %xmm1
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: movdqa %xmm15, %xmm11
; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
; SSE-NEXT: movdqa 256(%rdi), %xmm15
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2]
; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm13, %xmm1
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
; SSE-NEXT: movdqa 368(%rdi), %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa 176(%rdi), %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
; SSE-NEXT: movdqa %xmm10, %xmm14
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa 64(%rdi), %xmm9
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1]
; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3]
; SSE-NEXT: movdqa %xmm15, %xmm12
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa 288(%rdi), %xmm15
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[2,3,2,3]
; SSE-NEXT: movdqa %xmm13, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm11
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; SSE-NEXT: movdqa 400(%rdi), %xmm13
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 208(%rdi), %xmm10
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,0,1,1]
; SSE-NEXT: movdqa %xmm4, %xmm2
; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 96(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1]
; SSE-NEXT: movdqa %xmm9, %xmm3
; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm9, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3]
; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload
; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 320(%rdi), %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,1,1]
; SSE-NEXT: movdqa %xmm15, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,2,3,3]
; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm12[2],xmm4[3],xmm12[3]
; SSE-NEXT: movdqa %xmm12, %xmm14
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 432(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: movdqa %xmm13, %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
; SSE-NEXT: movdqa %xmm11, %xmm9
; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; SSE-NEXT: movdqa %xmm8, %xmm5
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[2,2,3,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: movdqa %xmm6, %xmm5
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,2,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1]
; SSE-NEXT: movdqa %xmm7, %xmm11
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT: movdqa %xmm7, %xmm5
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[2,2,3,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1]
; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: movdqa %xmm8, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2]
2234; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] 2235; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] 2236; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2237; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2238; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] 2239; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] 2240; SSE-NEXT: movdqa %xmm3, %xmm11 2241; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] 2242; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] 2243; SSE-NEXT: movdqa %xmm7, %xmm2 2244; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2245; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 2246; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] 2247; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 2248; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] 2249; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 2250; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] 2251; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 2252; SSE-NEXT: # xmm10 = mem[0,0,1,1] 2253; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] 2254; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1] 2255; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 2256; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 2257; SSE-NEXT: # xmm5 = mem[2,3,2,3] 2258; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 2259; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2260; SSE-NEXT: # xmm0 = mem[2,3,2,3] 2261; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 2262; SSE-NEXT: # xmm7 = mem[0,0,1,1] 2263; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] 2264; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] 2265; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 2266; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] 2267; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 2268; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2269; SSE-NEXT: # xmm0 = mem[2,3,2,3] 2270; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 2271; SSE-NEXT: # xmm6 = mem[0,0,1,1] 2272; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] 2273; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] 2274; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] 2275; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] 2276; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 2277; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] 2278; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 2279; SSE-NEXT: # xmm4 = mem[0,0,1,1] 2280; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] 2281; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] 2282; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2283; SSE-NEXT: movaps %xmm0, 48(%rsi) 2284; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2285; SSE-NEXT: movaps %xmm0, 32(%rsi) 2286; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2287; SSE-NEXT: movaps %xmm0, (%rsi) 2288; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2289; SSE-NEXT: movaps %xmm0, 16(%rsi) 2290; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2291; SSE-NEXT: movaps %xmm0, 48(%rdx) 2292; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload 2293; SSE-NEXT: movaps %xmm0, 32(%rdx) 2294; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2295; SSE-NEXT: movaps %xmm0, (%rdx) 2296; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2297; SSE-NEXT: movaps %xmm0, 16(%rdx) 2298; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2299; SSE-NEXT: movaps %xmm0, 48(%rcx) 2300; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2301; SSE-NEXT: movaps %xmm0, 32(%rcx) 2302; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2303; SSE-NEXT: movaps %xmm0, (%rcx) 2304; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2305; SSE-NEXT: movaps %xmm0, 16(%rcx) 2306; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2307; SSE-NEXT: movaps %xmm0, 48(%r8) 2308; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2309; SSE-NEXT: movaps %xmm0, 32(%r8) 2310; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2311; SSE-NEXT: movaps %xmm0, (%r8) 2312; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2313; SSE-NEXT: movaps %xmm0, 16(%r8) 2314; SSE-NEXT: movapd %xmm9, 48(%r9) 2315; SSE-NEXT: movapd %xmm14, 32(%r9) 2316; SSE-NEXT: movapd %xmm12, (%r9) 2317; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2318; SSE-NEXT: movaps %xmm0, 16(%r9) 2319; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 2320; SSE-NEXT: movapd %xmm13, 48(%rax) 2321; SSE-NEXT: movapd %xmm15, 32(%rax) 2322; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2323; SSE-NEXT: movaps %xmm0, (%rax) 2324; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2325; SSE-NEXT: movaps %xmm0, 16(%rax) 2326; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 2327; SSE-NEXT: movapd %xmm4, 48(%rax) 2328; SSE-NEXT: movapd %xmm6, 32(%rax) 2329; SSE-NEXT: movapd %xmm7, (%rax) 2330; SSE-NEXT: movapd %xmm10, 16(%rax) 2331; SSE-NEXT: addq $440, %rsp # imm = 0x1B8 2332; SSE-NEXT: retq 2333; 2334; AVX-LABEL: load_i32_stride7_vf16: 2335; AVX: # %bb.0: 2336; AVX-NEXT: subq $456, %rsp # imm = 0x1C8 2337; AVX-NEXT: vmovaps 32(%rdi), %ymm4 2338; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2339; AVX-NEXT: vmovaps (%rdi), %ymm6 2340; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2341; AVX-NEXT: vmovaps 96(%rdi), %ymm15 2342; AVX-NEXT: vmovaps 256(%rdi), %ymm2 2343; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2344; AVX-NEXT: vmovaps 224(%rdi), %ymm1 2345; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2346; AVX-NEXT: vmovaps 320(%rdi), %ymm5 2347; AVX-NEXT: vmovaps 304(%rdi), %xmm0 2348; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2349; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] 2350; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] 2351; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 2352; AVX-NEXT: vmovaps 224(%rdi), %xmm13 2353; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] 2354; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] 2355; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 2356; AVX-NEXT: vmovaps 384(%rdi), %xmm2 2357; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2358; AVX-NEXT: vmovaps 352(%rdi), %xmm1 2359; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2360; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 2361; AVX-NEXT: vmovaps 416(%rdi), %xmm12 2362; AVX-NEXT: vinsertps {{.*#+}} xmm1 = 
zero,xmm1[1,2],xmm12[1] 2363; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 2364; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2365; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2366; AVX-NEXT: vmovaps 80(%rdi), %xmm0 2367; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2368; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] 2369; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm4[6],ymm6[7] 2370; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 2371; AVX-NEXT: vmovaps (%rdi), %xmm9 2372; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] 2373; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] 2374; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 2375; AVX-NEXT: vmovaps 160(%rdi), %xmm2 2376; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2377; AVX-NEXT: vmovaps 128(%rdi), %xmm1 2378; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2379; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 2380; AVX-NEXT: vmovaps 192(%rdi), %xmm8 2381; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1] 2382; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 2383; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2384; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2385; AVX-NEXT: vmovaps 288(%rdi), %ymm6 2386; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm6[2,2],ymm5[5,5],ymm6[6,6] 2387; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 2388; AVX-NEXT: vmovaps 256(%rdi), %xmm11 2389; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0],xmm13[1],xmm11[2,3] 2390; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] 2391; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 2392; AVX-NEXT: vmovaps 384(%rdi), %ymm7 2393; AVX-NEXT: vmovaps 352(%rdi), %ymm1 2394; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm7[0,1] 2395; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] 2396; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 2397; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[2] 2398; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 2399; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] 2400; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2401; AVX-NEXT: vmovaps 64(%rdi), %ymm3 2402; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm3[2,2],ymm15[5,5],ymm3[6,6] 2403; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 2404; AVX-NEXT: vmovaps 32(%rdi), %xmm10 2405; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0],xmm9[1],xmm10[2,3] 2406; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3] 2407; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3,4,5,6,7] 2408; AVX-NEXT: vmovaps 160(%rdi), %ymm4 2409; AVX-NEXT: vmovaps 128(%rdi), %ymm0 2410; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm4[0,1] 2411; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm14[3,3],ymm0[4,4],ymm14[7,7] 2412; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 2413; AVX-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm8[2] 2414; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 2415; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] 2416; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2417; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm13[2,3,2,3] 2418; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] 2419; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 2420; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm6[0,3],ymm14[7,5],ymm6[4,7] 2421; AVX-NEXT: vshufps {{.*#+}} ymm14 
= ymm5[2,1],ymm14[2,0],ymm5[6,5],ymm14[6,4] 2422; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7] 2423; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] 2424; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 2425; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] 2426; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 2427; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7] 2428; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2429; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] 2430; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] 2431; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 2432; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm3[0,3],ymm12[7,5],ymm3[4,7] 2433; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm15[2,1],ymm12[2,0],ymm15[6,5],ymm12[6,4] 2434; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3,4,5,6,7] 2435; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] 2436; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 2437; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3] 2438; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 2439; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] 2440; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2441; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm5[0,0],ymm6[5,4],ymm5[4,4] 2442; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm2[0,2],ymm5[7,5],ymm2[4,6] 2443; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm13[3] 2444; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] 2445; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7] 2446; AVX-NEXT: vmovaps 416(%rdi), %ymm2 2447; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,1],ymm7[1,3],ymm2[4,5],ymm7[5,7] 2448; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,2],ymm6[2,0],ymm1[4,6],ymm6[6,4] 2449; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] 2450; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2451; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0],ymm15[0,0],ymm3[5,4],ymm15[4,4] 2452; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm15[3,1],ymm3[0,2],ymm15[7,5],ymm3[4,6] 2453; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm9[3] 2454; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] 2455; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] 2456; AVX-NEXT: vmovaps 192(%rdi), %ymm6 2457; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm4[1,3],ymm6[4,5],ymm4[5,7] 2458; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm5[2,0],ymm0[4,6],ymm5[6,4] 2459; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] 2460; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2461; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] 2462; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4] 2463; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm7[2,0],ymm2[5,4],ymm7[6,4] 2464; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] 2465; AVX-NEXT: vmovaps 320(%rdi), %xmm5 2466; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,1,0,1] 2467; AVX-NEXT: vmovaps 288(%rdi), %xmm8 2468; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] 2469; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm11[2,3,2,3] 2470; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] 2471; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] 2472; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] 2473; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2474; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = 
ymm0[2,3,0,1] 2475; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4] 2476; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] 2477; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] 2478; AVX-NEXT: vmovaps 64(%rdi), %xmm11 2479; AVX-NEXT: vmovaps 96(%rdi), %xmm9 2480; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm9[0,1,0,1] 2481; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] 2482; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3] 2483; AVX-NEXT: vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3] 2484; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] 2485; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] 2486; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 2487; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,1],ymm7[3,3],ymm2[6,5],ymm7[7,7] 2488; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 2489; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2490; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0],xmm0[1],xmm14[2,3] 2491; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 2492; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,0],ymm3[2,0],ymm7[5,4],ymm3[6,4] 2493; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm8[3] 2494; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 2495; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 2496; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm13[0,0],ymm12[1,0],ymm13[4,4],ymm12[5,4] 2497; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 2498; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[3,2] 2499; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm3[4,5,6,7] 2500; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,1],ymm4[3,3],ymm6[6,5],ymm4[7,7] 2501; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2502; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2503; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0],xmm1[1],xmm3[2,3] 2504; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 2505; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm4[2,0],ymm7[5,4],ymm4[6,4] 2506; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm11[3] 2507; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 2508; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 2509; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,0],ymm11[1,0],ymm10[4,4],ymm11[5,4] 2510; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 2511; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[3,2] 2512; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] 2513; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 2514; AVX-NEXT: # xmm7 = mem[0,1,0,1] 2515; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] 2516; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,0],ymm12[2,0],ymm13[5,4],ymm12[6,4] 2517; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 2518; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3] 2519; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] 2520; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm7[0,0],ymm2[7,4],ymm7[4,4] 2521; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3] 2522; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3] 2523; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 2524; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,0],ymm7[4,5],ymm2[6,4] 2525; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 2526; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 2527; AVX-NEXT: # xmm5 = mem[0,1,0,1] 2528; AVX-NEXT: vblendps {{.*#+}} xmm5 = 
xmm5[0,1,2],xmm9[3] 2529; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4] 2530; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 2531; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3] 2532; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3,0,1] 2533; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm7[0,0],ymm6[7,4],ymm7[4,4] 2534; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm1[2,3,2,3] 2535; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm3[1],xmm7[2,3] 2536; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 2537; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] 2538; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] 2539; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2540; AVX-NEXT: vmovaps %ymm0, (%rsi) 2541; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 2542; AVX-NEXT: vmovaps %ymm6, 32(%rsi) 2543; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2544; AVX-NEXT: vmovaps %ymm0, (%rdx) 2545; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2546; AVX-NEXT: vmovaps %ymm0, 32(%rdx) 2547; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2548; AVX-NEXT: vmovaps %ymm0, (%rcx) 2549; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2550; AVX-NEXT: vmovaps %ymm0, 32(%rcx) 2551; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2552; AVX-NEXT: vmovaps %ymm0, (%r8) 2553; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2554; AVX-NEXT: vmovaps %ymm0, 32(%r8) 2555; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2556; AVX-NEXT: vmovaps %ymm0, (%r9) 2557; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2558; AVX-NEXT: vmovaps %ymm0, 32(%r9) 2559; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 2560; AVX-NEXT: vmovaps %ymm4, (%rax) 2561; AVX-NEXT: vmovaps %ymm15, 32(%rax) 2562; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 2563; AVX-NEXT: vmovaps %ymm5, (%rax) 2564; AVX-NEXT: vmovaps %ymm2, 32(%rax) 2565; AVX-NEXT: addq $456, %rsp # imm = 0x1C8 2566; AVX-NEXT: vzeroupper 2567; AVX-NEXT: retq 2568; 2569; AVX2-LABEL: load_i32_stride7_vf16: 2570; AVX2: # %bb.0: 2571; AVX2-NEXT: subq $264, %rsp # imm = 0x108 2572; AVX2-NEXT: vmovdqa 288(%rdi), %ymm5 2573; AVX2-NEXT: vmovdqa 384(%rdi), %ymm9 2574; AVX2-NEXT: vmovdqa 352(%rdi), %ymm7 2575; AVX2-NEXT: vmovdqa 320(%rdi), %ymm4 2576; AVX2-NEXT: vmovdqa 256(%rdi), %ymm0 2577; AVX2-NEXT: vmovdqa 224(%rdi), %ymm3 2578; AVX2-NEXT: vmovdqa (%rdi), %ymm10 2579; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6 2580; AVX2-NEXT: vmovdqa 96(%rdi), %ymm15 2581; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm1 2582; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] 2583; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0] 2584; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm6[6],ymm10[7] 2585; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8 2586; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7] 2587; AVX2-NEXT: vmovdqa 128(%rdi), %xmm8 2588; AVX2-NEXT: vmovdqa 160(%rdi), %xmm11 2589; AVX2-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2590; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1] 2591; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2592; AVX2-NEXT: vpbroadcastd 196(%rdi), %ymm11 2593; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7] 2594; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] 2595; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2596; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill 2597; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2598; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7] 2599; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 2600; AVX2-NEXT: vpbroadcastq 304(%rdi), %ymm2 2601; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 2602; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 2603; AVX2-NEXT: vmovdqa 352(%rdi), %xmm2 2604; AVX2-NEXT: vmovdqa 384(%rdi), %xmm8 2605; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2606; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1] 2607; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 2608; AVX2-NEXT: vpbroadcastd 420(%rdi), %ymm8 2609; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7] 2610; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 2611; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2612; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 2613; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 2614; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] 2615; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 2616; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 2617; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] 2618; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] 2619; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2620; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7] 2621; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6] 2622; AVX2-NEXT: vpermd %ymm2, %ymm12, %ymm2 2623; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7] 2624; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2625; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 2626; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 2627; AVX2-NEXT: vmovdqa 160(%rdi), %ymm3 2628; AVX2-NEXT: vmovdqa 128(%rdi), %ymm2 2629; AVX2-NEXT: vpalignr {{.*#+}} ymm11 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 2630; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0] 2631; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm8[7] 2632; AVX2-NEXT: vmovdqa 64(%rdi), %ymm11 2633; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] 2634; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7] 2635; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6],ymm8[7] 2636; AVX2-NEXT: vpermd %ymm0, %ymm12, %ymm0 2637; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2638; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2639; AVX2-NEXT: vmovdqa 80(%rdi), %xmm0 2640; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] 2641; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 2642; AVX2-NEXT: vpbroadcastd 8(%rdi), %xmm1 2643; AVX2-NEXT: vmovdqa 32(%rdi), %xmm12 2644; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3] 2645; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 2646; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 2647; AVX2-NEXT: vpbroadcastd 204(%rdi), %ymm14 2648; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] 2649; AVX2-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2650; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2651; AVX2-NEXT: vmovdqa 304(%rdi), %xmm0 2652; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] 2653; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 2654; AVX2-NEXT: vpbroadcastd 232(%rdi), %xmm1 2655; AVX2-NEXT: vmovdqa 256(%rdi), %xmm14 2656; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] 2657; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 2658; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] 2659; AVX2-NEXT: vpbroadcastd 428(%rdi), %ymm13 2660; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] 2661; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2662; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2663; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7] 2664; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] 2665; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] 2666; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 2667; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 2668; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm3[1,3],ymm2[4,6],ymm3[5,7] 2669; AVX2-NEXT: vbroadcastss 208(%rdi), %ymm11 2670; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7] 2671; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2672; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 2673; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7] 2674; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],mem[3] 2675; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] 2676; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 2677; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 2678; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7] 2679; AVX2-NEXT: vbroadcastss 432(%rdi), %ymm4 2680; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] 2681; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2682; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2683; AVX2-NEXT: vpbroadcastd 100(%rdi), %xmm0 2684; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1 2685; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 2686; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,3,0,0] 2687; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm10[4,5,6,7] 2688; AVX2-NEXT: vmovdqa %ymm6, %ymm15 2689; AVX2-NEXT: vpermd %ymm5, %ymm4, %ymm5 2690; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3] 2691; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] 2692; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm11 2693; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm3[6,7] 2694; AVX2-NEXT: vpbroadcastd 212(%rdi), %ymm12 2695; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] 2696; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm11[4,5,6,7] 2697; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 2698; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 2699; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm6[4,5,6,7] 2700; AVX2-NEXT: vpermd %ymm5, %ymm4, %ymm4 2701; AVX2-NEXT: vpbroadcastd 324(%rdi), %xmm5 2702; AVX2-NEXT: vmovdqa 288(%rdi), %xmm13 2703; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] 2704; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] 2705; 
AVX2-NEXT: vpermd %ymm7, %ymm0, %ymm5 2706; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] 2707; AVX2-NEXT: vpbroadcastd 436(%rdi), %ymm11 2708; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm11[7] 2709; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm5[4,5,6,7] 2710; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] 2711; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] 2712; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] 2713; AVX2-NEXT: vpbroadcastd 216(%rdi), %ymm3 2714; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 2715; AVX2-NEXT: vmovdqa 96(%rdi), %xmm3 2716; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] 2717; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] 2718; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[1,0,2,3,5,4,6,7] 2719; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 2720; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] 2721; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 2722; AVX2-NEXT: vmovdqa 320(%rdi), %xmm8 2723; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],xmm13[3] 2724; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] 2725; AVX2-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 2726; AVX2-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] 2727; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 2728; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] 2729; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] 2730; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] 2731; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] 2732; AVX2-NEXT: vpbroadcastd 440(%rdi), %ymm5 2733; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] 2734; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 2735; AVX2-NEXT: vpbroadcastd 136(%rdi), %xmm4 2736; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 2737; AVX2-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 2738; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 2739; AVX2-NEXT: vpermd 192(%rdi), %ymm0, %ymm5 2740; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 2741; AVX2-NEXT: vpbroadcastd 80(%rdi), %ymm5 2742; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] 2743; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,3,2,3,6,7,6,7] 2744; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] 2745; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 2746; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] 2747; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] 2748; AVX2-NEXT: vpbroadcastd 360(%rdi), %xmm4 2749; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 2750; AVX2-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 2751; AVX2-NEXT: vpermd 416(%rdi), %ymm0, %ymm0 2752; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 2753; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] 2754; AVX2-NEXT: vpbroadcastd 304(%rdi), %ymm4 2755; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3] 2756; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] 2757; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] 2758; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 2759; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] 2760; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] 2761; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2762; AVX2-NEXT: vmovaps %ymm4, 32(%rsi) 2763; AVX2-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2764; AVX2-NEXT: vmovaps %ymm4, (%rsi) 2765; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2766; AVX2-NEXT: vmovaps %ymm4, 32(%rdx) 2767; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2768; AVX2-NEXT: vmovaps %ymm4, (%rdx) 2769; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2770; AVX2-NEXT: vmovaps %ymm4, 32(%rcx) 2771; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2772; AVX2-NEXT: vmovaps %ymm4, (%rcx) 2773; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2774; AVX2-NEXT: vmovaps %ymm4, 32(%r8) 2775; AVX2-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload 2776; AVX2-NEXT: vmovaps %ymm4, (%r8) 2777; AVX2-NEXT: vmovdqa %ymm11, 32(%r9) 2778; AVX2-NEXT: vmovdqa %ymm12, (%r9) 2779; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 2780; AVX2-NEXT: vmovdqa %ymm2, 32(%rax) 2781; AVX2-NEXT: vmovdqa %ymm1, (%rax) 2782; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 2783; AVX2-NEXT: vmovdqa %ymm0, 32(%rax) 2784; AVX2-NEXT: vmovdqa %ymm3, (%rax) 2785; AVX2-NEXT: addq $264, %rsp # imm = 0x108 2786; AVX2-NEXT: vzeroupper 2787; AVX2-NEXT: retq 2788; 2789; AVX2-FP-LABEL: load_i32_stride7_vf16: 2790; AVX2-FP: # %bb.0: 2791; AVX2-FP-NEXT: subq $264, %rsp # imm = 0x108 2792; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm5 2793; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm9 2794; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm7 2795; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm4 2796; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm0 2797; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm3 2798; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm10 2799; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6 2800; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm15 2801; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm1 2802; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] 2803; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0] 2804; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm6[6],ymm10[7] 2805; AVX2-FP-NEXT: vpermd %ymm8, %ymm2, %ymm8 2806; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7] 2807; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm8 2808; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm11 2809; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2810; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1] 2811; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2812; AVX2-FP-NEXT: vpbroadcastd 196(%rdi), %ymm11 2813; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7] 2814; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] 2815; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2816; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2817; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2818; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7] 2819; AVX2-FP-NEXT: vpermd %ymm1, %ymm2, %ymm1 2820; AVX2-FP-NEXT: vpbroadcastq 304(%rdi), %ymm2 2821; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 2822; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 2823; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm2 2824; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm8 2825; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2826; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1] 2827; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 2828; AVX2-FP-NEXT: vpbroadcastd 420(%rdi), %ymm8 2829; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7] 2830; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 2831; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2832; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 2833; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 2834; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] 2835; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 2836; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 2837; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] 2838; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] 2839; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2840; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7] 2841; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6] 2842; AVX2-FP-NEXT: vpermd %ymm2, %ymm12, %ymm2 2843; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7] 2844; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2845; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 2846; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 2847; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm3 2848; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm2 2849; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm11 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 2850; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0] 2851; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm8[7] 2852; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm11 2853; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] 2854; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7] 2855; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6],ymm8[7] 2856; AVX2-FP-NEXT: vpermd %ymm0, %ymm12, %ymm0 2857; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2858; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2859; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm0 2860; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] 2861; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 2862; AVX2-FP-NEXT: vpbroadcastd 8(%rdi), %xmm1 2863; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm12 2864; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3] 2865; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 2866; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 2867; AVX2-FP-NEXT: vpbroadcastd 204(%rdi), %ymm14 2868; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] 2869; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2870; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2871; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm0 2872; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] 2873; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 2874; AVX2-FP-NEXT: vpbroadcastd 232(%rdi), %xmm1 2875; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm14 2876; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] 2877; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 2878; AVX2-FP-NEXT: vpunpcklqdq 
{{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] 2879; AVX2-FP-NEXT: vpbroadcastd 428(%rdi), %ymm13 2880; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] 2881; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2882; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2883; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7] 2884; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] 2885; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] 2886; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 2887; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 2888; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm3[1,3],ymm2[4,6],ymm3[5,7] 2889; AVX2-FP-NEXT: vbroadcastss 208(%rdi), %ymm11 2890; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7] 2891; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2892; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 2893; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7] 2894; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],mem[3] 2895; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] 2896; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 2897; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 2898; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7] 2899; AVX2-FP-NEXT: vbroadcastss 432(%rdi), %ymm4 2900; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] 2901; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2902; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2903; AVX2-FP-NEXT: vpbroadcastd 100(%rdi), %xmm0 2904; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm1 2905; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 2906; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,3,0,0] 2907; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm10[4,5,6,7] 2908; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm15 2909; AVX2-FP-NEXT: vpermd %ymm5, %ymm4, %ymm5 2910; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3] 2911; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] 2912; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm11 2913; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm3[6,7] 2914; AVX2-FP-NEXT: vpbroadcastd 212(%rdi), %ymm12 2915; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] 2916; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm11[4,5,6,7] 2917; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 2918; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 2919; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm6[4,5,6,7] 2920; AVX2-FP-NEXT: vpermd %ymm5, %ymm4, %ymm4 2921; AVX2-FP-NEXT: vpbroadcastd 324(%rdi), %xmm5 2922; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm13 2923; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] 2924; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] 2925; AVX2-FP-NEXT: vpermd %ymm7, %ymm0, %ymm5 2926; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] 2927; AVX2-FP-NEXT: vpbroadcastd 436(%rdi), %ymm11 2928; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm11[7] 2929; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm5[4,5,6,7] 2930; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] 2931; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] 2932; AVX2-FP-NEXT: vpermq {{.*#+}} 
ymm2 = ymm2[0,1,0,3] 2933; AVX2-FP-NEXT: vpbroadcastd 216(%rdi), %ymm3 2934; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 2935; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm3 2936; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] 2937; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] 2938; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[1,0,2,3,5,4,6,7] 2939; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm4 2940; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] 2941; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 2942; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm8 2943; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],xmm13[3] 2944; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] 2945; AVX2-FP-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 2946; AVX2-FP-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] 2947; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm4 2948; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] 2949; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] 2950; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] 2951; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] 2952; AVX2-FP-NEXT: vpbroadcastd 440(%rdi), %ymm5 2953; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] 2954; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 2955; AVX2-FP-NEXT: vpbroadcastd 136(%rdi), %xmm4 2956; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 2957; AVX2-FP-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 2958; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 2959; AVX2-FP-NEXT: vpermd 192(%rdi), %ymm0, %ymm5 2960; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 2961; AVX2-FP-NEXT: vpbroadcastd 80(%rdi), %ymm5 2962; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] 2963; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,3,2,3,6,7,6,7] 2964; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] 2965; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 2966; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] 2967; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] 2968; AVX2-FP-NEXT: vpbroadcastd 360(%rdi), %xmm4 2969; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 2970; AVX2-FP-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 2971; AVX2-FP-NEXT: vpermd 416(%rdi), %ymm0, %ymm0 2972; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 2973; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] 2974; AVX2-FP-NEXT: vpbroadcastd 304(%rdi), %ymm4 2975; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3] 2976; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] 2977; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] 2978; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 2979; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] 2980; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] 2981; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2982; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi) 2983; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2984; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) 2985; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2986; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx) 2987; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2988; AVX2-FP-NEXT: vmovaps 
%ymm4, (%rdx) 2989; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2990; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx) 2991; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2992; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx) 2993; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2994; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r8) 2995; AVX2-FP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload 2996; AVX2-FP-NEXT: vmovaps %ymm4, (%r8) 2997; AVX2-FP-NEXT: vmovdqa %ymm11, 32(%r9) 2998; AVX2-FP-NEXT: vmovdqa %ymm12, (%r9) 2999; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3000; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rax) 3001; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax) 3002; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3003; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax) 3004; AVX2-FP-NEXT: vmovdqa %ymm3, (%rax) 3005; AVX2-FP-NEXT: addq $264, %rsp # imm = 0x108 3006; AVX2-FP-NEXT: vzeroupper 3007; AVX2-FP-NEXT: retq 3008; 3009; AVX2-FCP-LABEL: load_i32_stride7_vf16: 3010; AVX2-FCP: # %bb.0: 3011; AVX2-FCP-NEXT: subq $264, %rsp # imm = 0x108 3012; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm5 3013; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm9 3014; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm7 3015; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm4 3016; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 3017; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm3 3018; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm10 3019; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 3020; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm15 3021; AVX2-FCP-NEXT: vpbroadcastq 80(%rdi), %ymm1 3022; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] 3023; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0] 3024; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm6[6],ymm10[7] 3025; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm8 3026; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7] 3027; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm8 3028; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm11 3029; AVX2-FCP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3030; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1] 3031; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 3032; AVX2-FCP-NEXT: vpbroadcastd 196(%rdi), %ymm11 3033; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7] 3034; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] 3035; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3036; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3037; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3038; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7] 3039; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 3040; AVX2-FCP-NEXT: vpbroadcastq 304(%rdi), %ymm2 3041; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 3042; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 3043; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm2 3044; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm8 3045; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3046; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1] 3047; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 3048; AVX2-FCP-NEXT: vpbroadcastd 420(%rdi), %ymm8 3049; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7] 3050; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 3051; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3052; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 3053; 
AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 3054; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] 3055; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 3056; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 3057; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] 3058; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] 3059; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3060; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7] 3061; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6] 3062; AVX2-FCP-NEXT: vpermd %ymm2, %ymm12, %ymm2 3063; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7] 3064; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3065; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 3066; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 3067; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 3068; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 3069; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 3070; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0] 3071; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm8[7] 3072; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 3073; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] 3074; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7] 3075; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6],ymm8[7] 3076; AVX2-FCP-NEXT: vpermd %ymm0, %ymm12, %ymm0 3077; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 3078; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3079; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 3080; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] 3081; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 3082; AVX2-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm1 3083; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 3084; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3] 3085; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 3086; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] 3087; AVX2-FCP-NEXT: vpbroadcastd 204(%rdi), %ymm14 3088; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] 3089; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 3090; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3091; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm0 3092; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] 3093; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 3094; AVX2-FCP-NEXT: vpbroadcastd 232(%rdi), %xmm1 3095; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm14 3096; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] 3097; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 3098; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] 3099; AVX2-FCP-NEXT: vpbroadcastd 428(%rdi), %ymm13 3100; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5,6],ymm13[7] 3101; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 3102; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3103; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7] 3104; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] 3105; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] 3106; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 3107; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 3108; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm3[1,3],ymm2[4,6],ymm3[5,7] 3109; AVX2-FCP-NEXT: vbroadcastss 208(%rdi), %ymm11 3110; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7] 3111; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 3112; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 3113; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7] 3114; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],mem[3] 3115; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] 3116; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 3117; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 3118; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7] 3119; AVX2-FCP-NEXT: vbroadcastss 432(%rdi), %ymm4 3120; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] 3121; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 3122; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3123; AVX2-FCP-NEXT: vpbroadcastd 100(%rdi), %xmm0 3124; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 3125; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 3126; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,3,0,0] 3127; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm10[4,5,6,7] 3128; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm15 3129; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm5 3130; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3] 3131; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] 3132; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm11 3133; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm3[6,7] 3134; AVX2-FCP-NEXT: vpbroadcastd 212(%rdi), %ymm12 3135; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] 3136; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm11[4,5,6,7] 3137; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 3138; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 3139; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm6[4,5,6,7] 3140; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm4 3141; AVX2-FCP-NEXT: vpbroadcastd 324(%rdi), %xmm5 3142; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm13 3143; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] 3144; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] 3145; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm5 3146; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] 3147; AVX2-FCP-NEXT: vpbroadcastd 436(%rdi), %ymm11 3148; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm11[7] 3149; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm5[4,5,6,7] 3150; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] 3151; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,3,3,1,0,7,7] 3152; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 3153; AVX2-FCP-NEXT: vpbroadcastd 216(%rdi), %ymm4 3154; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] 3155; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm4 3156; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] 3157; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] 3158; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[1,0,2,3,5,4,6,7] 3159; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 3160; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] 3161; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 3162; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm8 3163; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],xmm13[3] 3164; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] 3165; AVX2-FCP-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 3166; AVX2-FCP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] 3167; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 3168; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] 3169; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] 3170; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm3 3171; AVX2-FCP-NEXT: vpbroadcastd 440(%rdi), %ymm5 3172; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] 3173; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 3174; AVX2-FCP-NEXT: vpbroadcastd 136(%rdi), %xmm3 3175; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 3176; AVX2-FCP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] 3177; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 3178; AVX2-FCP-NEXT: vpermd 192(%rdi), %ymm0, %ymm5 3179; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] 3180; AVX2-FCP-NEXT: vpbroadcastd 80(%rdi), %ymm5 3181; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] 3182; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,3,2,3,6,7,6,7] 3183; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] 3184; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 3185; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] 3186; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 3187; AVX2-FCP-NEXT: vpbroadcastd 360(%rdi), %xmm4 3188; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 3189; AVX2-FCP-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 3190; AVX2-FCP-NEXT: vpermd 416(%rdi), %ymm0, %ymm0 3191; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3192; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] 3193; AVX2-FCP-NEXT: vpbroadcastd 304(%rdi), %ymm4 3194; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3] 3195; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] 3196; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] 3197; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 3198; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] 3199; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] 3200; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3201; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi) 3202; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3203; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) 3204; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3205; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx) 3206; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3207; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx) 3208; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3209; AVX2-FCP-NEXT: vmovaps %ymm4, 
32(%rcx) 3210; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3211; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx) 3212; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3213; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r8) 3214; AVX2-FCP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload 3215; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8) 3216; AVX2-FCP-NEXT: vmovdqa %ymm11, 32(%r9) 3217; AVX2-FCP-NEXT: vmovdqa %ymm12, (%r9) 3218; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3219; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rax) 3220; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax) 3221; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3222; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rax) 3223; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rax) 3224; AVX2-FCP-NEXT: addq $264, %rsp # imm = 0x108 3225; AVX2-FCP-NEXT: vzeroupper 3226; AVX2-FCP-NEXT: retq 3227; 3228; AVX512-LABEL: load_i32_stride7_vf16: 3229; AVX512: # %bb.0: 3230; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 3231; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 3232; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm1 3233; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm5 3234; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm4 3235; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 3236; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 3237; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm8 3238; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm7 3239; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 3240; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 3241; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 3242; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] 3243; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 3244; AVX512-NEXT: movw $992, %di # imm = 0x3E0 3245; AVX512-NEXT: kmovw %edi, %k1 3246; AVX512-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} 3247; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 3248; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3249; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 3250; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 3251; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3252; AVX512-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 3253; AVX512-NEXT: movb $-32, %dil 3254; AVX512-NEXT: kmovw %edi, %k1 3255; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} 3256; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 3257; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3258; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 3259; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 3260; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3261; AVX512-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 3262; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 3263; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 3264; AVX512-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 3265; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 3266; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] 3267; AVX512-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 3268; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 3269; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 3270; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 3271; AVX512-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 3272; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] 3273; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 3274; AVX512-NEXT: movw $480, %di # imm = 0x1E0 3275; AVX512-NEXT: kmovw %edi, %k2 3276; AVX512-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} 
3277; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 3278; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3279; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 3280; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 3281; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3282; AVX512-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 3283; AVX512-NEXT: movw $-512, %di # imm = 0xFE00 3284; AVX512-NEXT: kmovw %edi, %k1 3285; AVX512-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} 3286; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 3287; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3288; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 3289; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 3290; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 3291; AVX512-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} 3292; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 3293; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3294; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 3295; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 3296; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3297; AVX512-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 3298; AVX512-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} 3299; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] 3300; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 3301; AVX512-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} 3302; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 3303; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 3304; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 3305; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 3306; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3307; AVX512-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 3308; AVX512-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} 3309; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 3310; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 3311; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 3312; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 3313; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3314; AVX512-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 3315; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] 3316; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 3317; AVX512-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 3318; AVX512-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} 3319; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 3320; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 3321; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 3322; AVX512-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 3323; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] 3324; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 3325; AVX512-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 3326; AVX512-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} 3327; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 3328; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 3329; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 3330; AVX512-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 3331; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] 3332; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 3333; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 3334; AVX512-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} 3335; AVX512-NEXT: vmovdqa64 %zmm3, (%rsi) 3336; AVX512-NEXT: vmovdqa64 %zmm14, (%rdx) 3337; AVX512-NEXT: 
vmovdqa64 %zmm7, (%rcx) 3338; AVX512-NEXT: vmovdqa64 %zmm9, (%r8) 3339; AVX512-NEXT: vmovdqa64 %zmm10, (%r9) 3340; AVX512-NEXT: vmovdqa64 %zmm6, (%r10) 3341; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) 3342; AVX512-NEXT: vzeroupper 3343; AVX512-NEXT: retq 3344; 3345; AVX512-FCP-LABEL: load_i32_stride7_vf16: 3346; AVX512-FCP: # %bb.0: 3347; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3348; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 3349; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 3350; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 3351; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 3352; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 3353; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 3354; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 3355; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 3356; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 3357; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 3358; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 3359; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] 3360; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 3361; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0 3362; AVX512-FCP-NEXT: kmovw %edi, %k1 3363; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} 3364; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 3365; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3366; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 3367; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 3368; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3369; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 3370; AVX512-FCP-NEXT: movb $-32, %dil 3371; AVX512-FCP-NEXT: kmovw %edi, %k1 3372; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} 3373; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 3374; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3375; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 3376; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 3377; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3378; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 3379; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 3380; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 3381; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 3382; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 3383; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] 3384; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 3385; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 3386; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 3387; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 3388; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 3389; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] 3390; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 3391; AVX512-FCP-NEXT: movw $480, %di # imm = 0x1E0 3392; AVX512-FCP-NEXT: kmovw %edi, %k2 3393; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} 3394; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 3395; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3396; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 3397; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 3398; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3399; AVX512-FCP-NEXT: 
vpermi2d %zmm1, %zmm8, %zmm15 3400; AVX512-FCP-NEXT: movw $-512, %di # imm = 0xFE00 3401; AVX512-FCP-NEXT: kmovw %edi, %k1 3402; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} 3403; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 3404; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3405; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 3406; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 3407; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 3408; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} 3409; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 3410; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3411; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 3412; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 3413; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3414; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 3415; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} 3416; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] 3417; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 3418; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} 3419; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 3420; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 3421; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 3422; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 3423; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3424; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 3425; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} 3426; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 3427; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 3428; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 3429; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 3430; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3431; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 3432; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] 3433; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 3434; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 3435; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} 3436; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 3437; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 3438; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 3439; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 3440; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] 3441; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 3442; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 3443; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} 3444; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 3445; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 3446; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 3447; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 3448; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] 3449; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 3450; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 3451; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} 3452; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rsi) 3453; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) 3454; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) 3455; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%r8) 3456; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%r9) 3457; AVX512-FCP-NEXT: 
vmovdqa64 %zmm6, (%r10) 3458; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 3459; AVX512-FCP-NEXT: vzeroupper 3460; AVX512-FCP-NEXT: retq 3461; 3462; AVX512DQ-LABEL: load_i32_stride7_vf16: 3463; AVX512DQ: # %bb.0: 3464; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 3465; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 3466; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm1 3467; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm5 3468; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm4 3469; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 3470; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 3471; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm8 3472; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm7 3473; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 3474; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 3475; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 3476; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] 3477; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 3478; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 3479; AVX512DQ-NEXT: kmovw %edi, %k1 3480; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} 3481; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 3482; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3483; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 3484; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 3485; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3486; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 3487; AVX512DQ-NEXT: movb $-32, %dil 3488; AVX512DQ-NEXT: kmovw %edi, %k1 3489; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} 3490; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 3491; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3492; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 3493; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 3494; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3495; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 3496; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 3497; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 3498; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 3499; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 3500; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] 3501; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 3502; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 3503; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 3504; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 3505; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 3506; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] 3507; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 3508; AVX512DQ-NEXT: movw $480, %di # imm = 0x1E0 3509; AVX512DQ-NEXT: kmovw %edi, %k2 3510; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} 3511; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 3512; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3513; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 3514; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 3515; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3516; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 3517; AVX512DQ-NEXT: movw $-512, %di # imm = 0xFE00 3518; AVX512DQ-NEXT: kmovw %edi, %k1 3519; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} 3520; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = 
[26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 3521; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3522; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 3523; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 3524; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 3525; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} 3526; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 3527; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3528; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 3529; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 3530; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3531; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 3532; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} 3533; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] 3534; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 3535; AVX512DQ-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} 3536; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 3537; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 3538; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 3539; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 3540; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3541; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 3542; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} 3543; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 3544; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 3545; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 3546; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 3547; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3548; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 3549; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] 3550; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 3551; AVX512DQ-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 3552; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} 3553; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 3554; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 3555; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 3556; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 3557; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] 3558; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 3559; AVX512DQ-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 3560; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} 3561; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 3562; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 3563; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 3564; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 3565; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] 3566; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 3567; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 3568; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} 3569; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rsi) 3570; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%rdx) 3571; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rcx) 3572; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%r8) 3573; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%r9) 3574; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r10) 3575; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) 3576; AVX512DQ-NEXT: vzeroupper 3577; AVX512DQ-NEXT: retq 3578; 3579; AVX512DQ-FCP-LABEL: load_i32_stride7_vf16: 3580; AVX512DQ-FCP: # %bb.0: 3581; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3582; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 3583; 
AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 3584; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 3585; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 3586; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 3587; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 3588; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 3589; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 3590; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 3591; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 3592; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 3593; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] 3594; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 3595; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0 3596; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 3597; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} 3598; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 3599; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3600; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 3601; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 3602; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3603; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 3604; AVX512DQ-FCP-NEXT: movb $-32, %dil 3605; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 3606; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} 3607; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 3608; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3609; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 3610; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 3611; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3612; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 3613; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 3614; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 3615; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 3616; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 3617; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] 3618; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 3619; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 3620; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 3621; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 3622; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 3623; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] 3624; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 3625; AVX512DQ-FCP-NEXT: movw $480, %di # imm = 0x1E0 3626; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 3627; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} 3628; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 3629; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3630; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 3631; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 3632; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3633; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 3634; AVX512DQ-FCP-NEXT: movw $-512, %di # imm = 0xFE00 3635; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 3636; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} 3637; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 3638; AVX512DQ-FCP-NEXT: # zmm8 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3639; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 3640; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 3641; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 3642; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} 3643; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 3644; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3645; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 3646; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 3647; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3648; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 3649; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} 3650; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] 3651; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 3652; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} 3653; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 3654; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 3655; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 3656; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 3657; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3658; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 3659; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} 3660; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 3661; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 3662; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 3663; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 3664; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3665; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 3666; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] 3667; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 3668; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 3669; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} 3670; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 3671; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 3672; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 3673; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 3674; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] 3675; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 3676; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 3677; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} 3678; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 3679; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 3680; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 3681; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 3682; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] 3683; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 3684; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 3685; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} 3686; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rsi) 3687; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) 3688; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) 3689; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%r8) 3690; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%r9) 3691; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r10) 3692; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 3693; AVX512DQ-FCP-NEXT: vzeroupper 3694; AVX512DQ-FCP-NEXT: retq 3695; 3696; AVX512BW-LABEL: load_i32_stride7_vf16: 3697; 
AVX512BW: # %bb.0: 3698; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 3699; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 3700; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 3701; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 3702; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 3703; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3704; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 3705; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 3706; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 3707; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 3708; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 3709; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 3710; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] 3711; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 3712; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 3713; AVX512BW-NEXT: kmovd %edi, %k1 3714; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} 3715; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 3716; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3717; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 3718; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 3719; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3720; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 3721; AVX512BW-NEXT: movb $-32, %dil 3722; AVX512BW-NEXT: kmovd %edi, %k1 3723; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} 3724; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 3725; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3726; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 3727; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 3728; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3729; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 3730; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 3731; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 3732; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 3733; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 3734; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] 3735; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 3736; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 3737; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 3738; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 3739; AVX512BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 3740; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] 3741; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 3742; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 3743; AVX512BW-NEXT: kmovd %edi, %k2 3744; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} 3745; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 3746; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3747; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 3748; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 3749; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3750; AVX512BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 3751; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 3752; AVX512BW-NEXT: kmovd %edi, %k1 3753; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} 3754; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 3755; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3756; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 3757; AVX512BW-NEXT: vpmovsxbd 
{{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 3758; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 3759; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} 3760; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 3761; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3762; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 3763; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 3764; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3765; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 3766; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} 3767; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] 3768; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 3769; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} 3770; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 3771; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 3772; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 3773; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 3774; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3775; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 3776; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} 3777; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 3778; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 3779; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 3780; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 3781; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3782; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 3783; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] 3784; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 3785; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 3786; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} 3787; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 3788; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 3789; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 3790; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 3791; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] 3792; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 3793; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 3794; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} 3795; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 3796; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 3797; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 3798; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 3799; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] 3800; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 3801; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 3802; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} 3803; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rsi) 3804; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) 3805; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) 3806; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) 3807; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r9) 3808; AVX512BW-NEXT: vmovdqa64 %zmm6, (%r10) 3809; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) 3810; AVX512BW-NEXT: vzeroupper 3811; AVX512BW-NEXT: retq 3812; 3813; AVX512BW-FCP-LABEL: load_i32_stride7_vf16: 3814; AVX512BW-FCP: # %bb.0: 3815; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3816; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 3817; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 3818; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 3819; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 3820; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 3821; 
AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 3822; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 3823; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 3824; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 3825; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 3826; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 3827; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] 3828; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 3829; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 3830; AVX512BW-FCP-NEXT: kmovd %edi, %k1 3831; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} 3832; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 3833; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3834; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 3835; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 3836; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3837; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 3838; AVX512BW-FCP-NEXT: movb $-32, %dil 3839; AVX512BW-FCP-NEXT: kmovd %edi, %k1 3840; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} 3841; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 3842; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3843; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 3844; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 3845; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3846; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 3847; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 3848; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 3849; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 3850; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 3851; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] 3852; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 3853; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 3854; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 3855; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 3856; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 3857; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] 3858; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 3859; AVX512BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 3860; AVX512BW-FCP-NEXT: kmovd %edi, %k2 3861; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} 3862; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 3863; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3864; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 3865; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 3866; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3867; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 3868; AVX512BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 3869; AVX512BW-FCP-NEXT: kmovd %edi, %k1 3870; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} 3871; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 3872; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3873; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 3874; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 3875; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 3876; 
AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} 3877; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 3878; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3879; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 3880; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 3881; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3882; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 3883; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} 3884; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] 3885; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 3886; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} 3887; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 3888; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 3889; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 3890; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 3891; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3892; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 3893; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} 3894; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 3895; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 3896; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 3897; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 3898; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3899; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 3900; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] 3901; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 3902; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 3903; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} 3904; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 3905; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 3906; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 3907; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 3908; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] 3909; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 3910; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 3911; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} 3912; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 3913; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 3914; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 3915; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 3916; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] 3917; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 3918; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 3919; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} 3920; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rsi) 3921; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) 3922; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) 3923; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%r8) 3924; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%r9) 3925; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%r10) 3926; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 3927; AVX512BW-FCP-NEXT: vzeroupper 3928; AVX512BW-FCP-NEXT: retq 3929; 3930; AVX512DQ-BW-LABEL: load_i32_stride7_vf16: 3931; AVX512DQ-BW: # %bb.0: 3932; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 3933; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 3934; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm1 3935; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm5 
3936; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm4 3937; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 3938; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 3939; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm8 3940; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm7 3941; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 3942; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 3943; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 3944; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] 3945; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 3946; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 3947; AVX512DQ-BW-NEXT: kmovd %edi, %k1 3948; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} 3949; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 3950; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3951; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 3952; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 3953; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3954; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 3955; AVX512DQ-BW-NEXT: movb $-32, %dil 3956; AVX512DQ-BW-NEXT: kmovd %edi, %k1 3957; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} 3958; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 3959; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3960; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 3961; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 3962; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3963; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 3964; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 3965; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 3966; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 3967; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 3968; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] 3969; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 3970; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 3971; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 3972; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 3973; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 3974; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] 3975; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 3976; AVX512DQ-BW-NEXT: movw $480, %di # imm = 0x1E0 3977; AVX512DQ-BW-NEXT: kmovd %edi, %k2 3978; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} 3979; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 3980; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3981; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 3982; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 3983; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3984; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 3985; AVX512DQ-BW-NEXT: movw $-512, %di # imm = 0xFE00 3986; AVX512DQ-BW-NEXT: kmovd %edi, %k1 3987; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} 3988; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 3989; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3990; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 3991; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] 3992; 
AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 3993; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} 3994; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 3995; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3996; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 3997; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 3998; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 3999; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 4000; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} 4001; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] 4002; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 4003; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} 4004; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 4005; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 4006; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 4007; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 4008; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 4009; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 4010; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} 4011; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 4012; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 4013; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 4014; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 4015; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 4016; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 4017; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] 4018; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 4019; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 4020; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} 4021; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 4022; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 4023; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 4024; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 4025; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] 4026; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 4027; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 4028; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} 4029; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 4030; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 4031; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 4032; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 4033; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] 4034; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 4035; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 4036; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} 4037; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rsi) 4038; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rdx) 4039; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rcx) 4040; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%r8) 4041; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%r9) 4042; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%r10) 4043; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) 4044; AVX512DQ-BW-NEXT: vzeroupper 4045; AVX512DQ-BW-NEXT: retq 4046; 4047; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf16: 4048; AVX512DQ-BW-FCP: # %bb.0: 4049; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4050; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 4051; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 4052; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 320(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1]
; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25]
; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm9
; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18]
; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20]
; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21]
; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm11
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22]
; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm12
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7]
; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14
; AVX512DQ-BW-FCP-NEXT: movw $480, %di # imm = 0x1E0
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0]
; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm8
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26]
; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm15
; AVX512DQ-BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19]
; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0]
; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm9
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27]
; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0]
; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28]
; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0]
; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29]
; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30]
; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31]
; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%r10)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <112 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105>
  %strided.vec1 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106>
  %strided.vec2 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107>
  %strided.vec3 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108>
  %strided.vec4 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109>
  %strided.vec5 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110>
  %strided.vec6 = shufflevector <112 x i32> %wide.vec, <112 x i32> poison, <16 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111>
  store <16 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <16 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <16 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <16 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <16 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <16 x i32> %strided.vec5, ptr %out.vec5, align 64
  store <16 x i32> %strided.vec6, ptr %out.vec6, align 64
  ret void
}

define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i32_stride7_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $1160, %rsp # imm = 0x488
; SSE-NEXT: movdqa 80(%rdi), %xmm8
; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa (%rdi), %xmm13
; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 16(%rdi), %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdi), %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 640(%rdi), %xmm3
; SSE-NEXT: movdqa 608(%rdi), %xmm4
; SSE-NEXT: movdqa 560(%rdi), %xmm10
; SSE-NEXT: movdqa 576(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 192(%rdi), %xmm14
; SSE-NEXT: movdqa 160(%rdi), %xmm12
; SSE-NEXT: movdqa 112(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 128(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:
punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 4206; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] 4207; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4208; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] 4209; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 4210; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4211; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] 4212; SSE-NEXT: movdqa %xmm10, %xmm2 4213; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4214; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 4215; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] 4216; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4217; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 4218; SSE-NEXT: movdqa %xmm3, %xmm7 4219; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4220; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 4221; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4222; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] 4223; SSE-NEXT: movdqa %xmm13, %xmm2 4224; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 4225; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] 4226; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] 4227; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 4228; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4229; SSE-NEXT: movdqa 448(%rdi), %xmm2 4230; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4231; SSE-NEXT: movdqa 464(%rdi), %xmm0 4232; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4233; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 4234; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 4235; SSE-NEXT: movdqa 528(%rdi), %xmm9 4236; SSE-NEXT: movdqa 496(%rdi), %xmm13 4237; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] 4238; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4239; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 4240; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4241; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 4242; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4243; SSE-NEXT: movdqa 336(%rdi), %xmm2 4244; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4245; SSE-NEXT: movdqa 352(%rdi), %xmm0 4246; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4247; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 4248; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 4249; SSE-NEXT: movdqa 416(%rdi), %xmm3 4250; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4251; SSE-NEXT: movdqa 384(%rdi), %xmm11 4252; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] 4253; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4254; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 4255; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 4256; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4257; SSE-NEXT: movdqa 784(%rdi), %xmm2 4258; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4259; SSE-NEXT: movdqa 800(%rdi), %xmm0 4260; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4261; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 4262; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 4263; SSE-NEXT: movdqa 864(%rdi), %xmm8 4264; SSE-NEXT: movdqa 832(%rdi), %xmm15 4265; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] 4266; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4267; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] 4268; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4269; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 4270; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4271; SSE-NEXT: movdqa 224(%rdi), %xmm3 4272; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill 4273; SSE-NEXT: movdqa 240(%rdi), %xmm0 4274; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4275; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 4276; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 4277; SSE-NEXT: movdqa 304(%rdi), %xmm1 4278; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4279; SSE-NEXT: movdqa 272(%rdi), %xmm6 4280; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] 4281; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4282; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4283; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 4284; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4285; SSE-NEXT: movdqa 672(%rdi), %xmm3 4286; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4287; SSE-NEXT: movdqa 688(%rdi), %xmm0 4288; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4289; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 4290; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 4291; SSE-NEXT: movdqa 752(%rdi), %xmm1 4292; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4293; SSE-NEXT: movdqa 720(%rdi), %xmm0 4294; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4295; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 4296; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4297; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 4298; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4299; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4300; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] 4301; SSE-NEXT: movdqa %xmm12, %xmm3 4302; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] 4303; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4304; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] 4305; SSE-NEXT: movdqa 144(%rdi), %xmm1 4306; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4307; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4308; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] 4309; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4310; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] 4311; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] 4312; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] 4313; SSE-NEXT: movdqa 592(%rdi), %xmm1 4314; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4315; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4316; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] 4317; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4318; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 4319; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] 4320; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4321; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4322; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4323; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm3[1,1,1,1] 4324; SSE-NEXT: movdqa 32(%rdi), %xmm4 4325; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4326; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 4327; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 4328; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4329; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] 4330; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] 4331; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4332; SSE-NEXT: # xmm0 = mem[1,1,1,1] 4333; SSE-NEXT: movdqa 480(%rdi), %xmm4 4334; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4335; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 4336; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] 4337; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4338; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4339; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] 4340; SSE-NEXT: movdqa %xmm11, %xmm4 4341; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] 4342; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4343; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 4344; SSE-NEXT: movdqa 368(%rdi), %xmm11 4345; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] 4346; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4347; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] 4348; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4349; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] 4350; SSE-NEXT: movdqa %xmm15, %xmm4 4351; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] 4352; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 4353; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] 4354; SSE-NEXT: movdqa 816(%rdi), %xmm7 4355; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4356; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] 4357; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] 4358; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4359; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4360; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] 4361; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] 4362; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload 4363; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,1,1] 4364; SSE-NEXT: movdqa 256(%rdi), %xmm13 4365; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] 4366; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] 4367; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4368; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 4369; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,2,2,2] 4370; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4371; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] 4372; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 4373; SSE-NEXT: # xmm7 = mem[1,1,1,1] 4374; SSE-NEXT: movdqa 704(%rdi), %xmm0 4375; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4376; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] 4377; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] 4378; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4379; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] 4380; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 
16-byte Folded Reload 4381; SSE-NEXT: # xmm8 = mem[1,1,1,1] 4382; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 4383; SSE-NEXT: movdqa 176(%rdi), %xmm0 4384; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4385; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[2,3,2,3] 4386; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] 4387; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] 4388; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] 4389; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4390; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] 4391; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4392; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,1,1] 4393; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 4394; SSE-NEXT: movdqa 64(%rdi), %xmm3 4395; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,3,2,3] 4396; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] 4397; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] 4398; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] 4399; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4400; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] 4401; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] 4402; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 4403; SSE-NEXT: movdqa 400(%rdi), %xmm1 4404; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4405; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,3,2,3] 4406; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 4407; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] 4408; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] 4409; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4410; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] 4411; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[1,1,1,1] 4412; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 4413; SSE-NEXT: movdqa 288(%rdi), %xmm10 4414; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] 4415; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] 4416; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4417; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] 4418; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] 4419; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4420; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 4421; SSE-NEXT: # xmm7 = mem[2,3,2,3] 4422; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 4423; SSE-NEXT: # xmm8 = mem[1,1,1,1] 4424; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 4425; SSE-NEXT: movdqa 624(%rdi), %xmm1 4426; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4427; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 4428; SSE-NEXT: # xmm8 = mem[2,3,2,3] 4429; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 4430; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] 4431; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] 4432; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4433; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 4434; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] 4435; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 4436; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[1,1,1,1] 4437; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 4438; SSE-NEXT: movdqa 512(%rdi), %xmm1 4439; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4440; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 4441; SSE-NEXT: # xmm8 = mem[2,3,2,3] 4442; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 4443; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] 4444; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] 4445; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4446; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,3,2,3] 4447; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 4448; SSE-NEXT: # xmm8 = mem[1,1,1,1] 4449; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 4450; SSE-NEXT: movdqa 848(%rdi), %xmm1 4451; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4452; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 4453; SSE-NEXT: # xmm8 = mem[2,3,2,3] 4454; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 4455; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] 4456; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] 4457; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4458; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4459; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] 4460; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 4461; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] 4462; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 4463; SSE-NEXT: movdqa 736(%rdi), %xmm2 4464; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3] 4465; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] 4466; SSE-NEXT: movdqa %xmm2, %xmm12 4467; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4468; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] 4469; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] 4470; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4471; SSE-NEXT: movdqa 96(%rdi), %xmm1 4472; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4473; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] 4474; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4475; SSE-NEXT: movdqa %xmm3, %xmm1 4476; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] 4477; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 4478; SSE-NEXT: # xmm5 = mem[2,2,3,3] 4479; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] 4480; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] 4481; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4482; SSE-NEXT: movdqa 208(%rdi), %xmm0 4483; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4484; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] 4485; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4486; SSE-NEXT: movdqa %xmm4, %xmm2 4487; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 4488; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4489; SSE-NEXT: # xmm1 = mem[2,2,3,3] 4490; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 4491; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] 4492; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 4493; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4494; SSE-NEXT: movdqa 320(%rdi), %xmm0 4495; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4496; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 4497; SSE-NEXT: movdqa %xmm10, %xmm2 4498; SSE-NEXT: 
punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4499; SSE-NEXT: pshufd $250, (%rsp), %xmm1 # 16-byte Folded Reload 4500; SSE-NEXT: # xmm1 = mem[2,2,3,3] 4501; SSE-NEXT: movdqa %xmm13, %xmm10 4502; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] 4503; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 4504; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill 4505; SSE-NEXT: movdqa 432(%rdi), %xmm0 4506; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4507; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 4508; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 4509; SSE-NEXT: movdqa %xmm13, %xmm2 4510; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4511; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4512; SSE-NEXT: # xmm1 = mem[2,2,3,3] 4513; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 4514; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] 4515; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 4516; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4517; SSE-NEXT: movdqa 544(%rdi), %xmm0 4518; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4519; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 4520; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4521; SSE-NEXT: movdqa %xmm5, %xmm2 4522; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4523; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] 4524; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] 4525; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 4526; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4527; SSE-NEXT: movdqa 656(%rdi), %xmm0 4528; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4529; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 4530; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4531; SSE-NEXT: movdqa %xmm7, %xmm2 4532; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4533; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4534; SSE-NEXT: # xmm1 = mem[2,2,3,3] 4535; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 4536; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] 4537; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 4538; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4539; SSE-NEXT: movdqa 768(%rdi), %xmm0 4540; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4541; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 4542; SSE-NEXT: movdqa %xmm12, %xmm2 4543; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4544; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] 4545; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] 4546; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 4547; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4548; SSE-NEXT: movdqa 880(%rdi), %xmm0 4549; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4550; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 4551; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 4552; SSE-NEXT: movdqa %xmm11, %xmm0 4553; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4554; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4555; SSE-NEXT: # xmm1 = mem[2,2,3,3] 4556; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 4557; 
SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] 4558; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 4559; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4560; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4561; SSE-NEXT: # xmm1 = mem[3,3,3,3] 4562; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4563; SSE-NEXT: movdqa %xmm6, %xmm2 4564; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4565; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] 4566; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4567; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 4568; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 4569; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4570; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] 4571; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 4572; SSE-NEXT: movdqa %xmm8, %xmm2 4573; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4574; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] 4575; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4576; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 4577; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 4578; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4579; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] 4580; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 4581; SSE-NEXT: movdqa %xmm9, %xmm1 4582; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4583; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4584; SSE-NEXT: # xmm0 = mem[2,2,3,3] 4585; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4586; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 4587; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 4588; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4589; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] 4590; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 4591; SSE-NEXT: movdqa %xmm10, %xmm1 4592; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4593; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] 4594; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4595; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4596; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 4597; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4598; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4599; SSE-NEXT: # xmm0 = mem[3,3,3,3] 4600; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 4601; SSE-NEXT: movdqa %xmm13, %xmm1 4602; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4603; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] 4604; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4605; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 4606; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 4607; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4608; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] 4609; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 4610; SSE-NEXT: movdqa %xmm14, %xmm1 4611; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4612; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] 4613; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Folded Reload 4614; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4615; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 4616; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4617; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4618; SSE-NEXT: # xmm0 = mem[3,3,3,3] 4619; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 4620; SSE-NEXT: movdqa %xmm15, %xmm1 4621; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4622; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4623; SSE-NEXT: # xmm0 = mem[2,2,3,3] 4624; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4625; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4626; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 4627; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4628; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] 4629; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4630; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4631; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] 4632; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4633; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] 4634; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 4635; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4636; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] 4637; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4638; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4639; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] 4640; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4641; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4642; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 4643; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4644; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] 4645; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4646; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4647; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 4648; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4649; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 4650; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 4651; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4652; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] 4653; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4654; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4655; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] 4656; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4657; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 4658; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 4659; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4660; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4661; SSE-NEXT: # xmm0 = mem[2,2,2,2] 4662; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4663; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4664; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] 4665; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 4666; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] 4667; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 4668; SSE-NEXT: movapd %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4669; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] 4670; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4671; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4672; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] 4673; SSE-NEXT: movdqa %xmm13, %xmm5 4674; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4675; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 4676; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 4677; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4678; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 4679; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] 4680; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4681; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4682; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] 4683; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4684; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 4685; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 4686; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4687; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 4688; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] 4689; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4690; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4691; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] 4692; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 4693; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] 4694; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 4695; SSE-NEXT: movapd %xmm1, %xmm15 4696; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] 4697; SSE-NEXT: movdqa %xmm11, %xmm1 4698; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4699; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4700; SSE-NEXT: # xmm0 = mem[1,1,1,1] 4701; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 4702; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] 4703; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 4704; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4705; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4706; SSE-NEXT: # xmm0 = mem[1,1,1,1] 4707; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4708; SSE-NEXT: # xmm1 = mem[2,3,2,3] 4709; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4710; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4711; SSE-NEXT: # xmm0 = mem[2,3,2,3] 4712; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 4713; SSE-NEXT: # xmm9 = mem[0,0,1,1] 4714; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] 4715; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] 4716; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] 4717; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] 4718; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4719; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4720; SSE-NEXT: # xmm0 = mem[2,3,2,3] 4721; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 4722; SSE-NEXT: # xmm8 = mem[0,0,1,1] 4723; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] 4724; SSE-NEXT: movsd {{.*#+}} xmm8 
= xmm1[0],xmm8[1] 4725; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] 4726; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4727; SSE-NEXT: # xmm1 = mem[2,3,2,3] 4728; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4729; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4730; SSE-NEXT: # xmm0 = mem[2,3,2,3] 4731; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 4732; SSE-NEXT: # xmm7 = mem[0,0,1,1] 4733; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] 4734; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] 4735; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] 4736; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4737; SSE-NEXT: # xmm1 = mem[2,3,2,3] 4738; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4739; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4740; SSE-NEXT: # xmm0 = mem[2,3,2,3] 4741; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 4742; SSE-NEXT: # xmm6 = mem[0,0,1,1] 4743; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] 4744; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] 4745; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 4746; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] 4747; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4748; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4749; SSE-NEXT: # xmm0 = mem[2,3,2,3] 4750; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 4751; SSE-NEXT: # xmm5 = mem[0,0,1,1] 4752; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 4753; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] 4754; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] 4755; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4756; SSE-NEXT: # xmm1 = mem[2,3,2,3] 4757; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4758; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] 4759; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 4760; SSE-NEXT: # xmm4 = mem[0,0,1,1] 4761; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] 4762; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] 4763; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] 4764; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4765; SSE-NEXT: # xmm0 = mem[2,3,2,3] 4766; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4767; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] 4768; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 4769; SSE-NEXT: # xmm3 = mem[0,0,1,1] 4770; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 4771; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] 4772; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] 4773; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4774; SSE-NEXT: # xmm1 = mem[2,3,2,3] 4775; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4776; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 4777; SSE-NEXT: # xmm2 = mem[2,3,2,3] 4778; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4779; SSE-NEXT: # xmm0 = mem[0,0,1,1] 4780; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 4781; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 
4782; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4783; SSE-NEXT: movaps %xmm1, 96(%rsi) 4784; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4785; SSE-NEXT: movaps %xmm1, 32(%rsi) 4786; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4787; SSE-NEXT: movaps %xmm1, 112(%rsi) 4788; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4789; SSE-NEXT: movaps %xmm1, 48(%rsi) 4790; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4791; SSE-NEXT: movaps %xmm1, 64(%rsi) 4792; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4793; SSE-NEXT: movaps %xmm1, (%rsi) 4794; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4795; SSE-NEXT: movaps %xmm1, 80(%rsi) 4796; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4797; SSE-NEXT: movaps %xmm1, 16(%rsi) 4798; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4799; SSE-NEXT: movaps %xmm1, 96(%rdx) 4800; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4801; SSE-NEXT: movaps %xmm1, 32(%rdx) 4802; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4803; SSE-NEXT: movaps %xmm1, 112(%rdx) 4804; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4805; SSE-NEXT: movaps %xmm1, 48(%rdx) 4806; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4807; SSE-NEXT: movaps %xmm1, 64(%rdx) 4808; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4809; SSE-NEXT: movaps %xmm1, (%rdx) 4810; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4811; SSE-NEXT: movaps %xmm1, 80(%rdx) 4812; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4813; SSE-NEXT: movaps %xmm1, 16(%rdx) 4814; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4815; SSE-NEXT: movaps %xmm1, 96(%rcx) 4816; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4817; SSE-NEXT: movaps %xmm1, 112(%rcx) 4818; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4819; SSE-NEXT: movaps %xmm1, 64(%rcx) 4820; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4821; SSE-NEXT: movaps %xmm1, 80(%rcx) 4822; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4823; SSE-NEXT: movaps %xmm1, 32(%rcx) 4824; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4825; SSE-NEXT: movaps %xmm1, 48(%rcx) 4826; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4827; SSE-NEXT: movaps %xmm1, (%rcx) 4828; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4829; SSE-NEXT: movaps %xmm1, 16(%rcx) 4830; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4831; SSE-NEXT: movaps %xmm1, 112(%r8) 4832; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4833; SSE-NEXT: movaps %xmm1, 96(%r8) 4834; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4835; SSE-NEXT: movaps %xmm1, 80(%r8) 4836; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4837; SSE-NEXT: movaps %xmm1, 64(%r8) 4838; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4839; SSE-NEXT: movaps %xmm1, 48(%r8) 4840; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload 4841; SSE-NEXT: movaps %xmm1, 32(%r8) 4842; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4843; SSE-NEXT: movaps %xmm1, 16(%r8) 4844; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4845; SSE-NEXT: movaps %xmm1, (%r8) 4846; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4847; SSE-NEXT: movaps %xmm1, 112(%r9) 4848; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4849; SSE-NEXT: movaps %xmm1, 96(%r9) 4850; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4851; SSE-NEXT: movaps %xmm1, 80(%r9) 4852; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4853; SSE-NEXT: movaps %xmm1, 64(%r9) 4854; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4855; SSE-NEXT: movaps %xmm1, 48(%r9) 4856; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4857; SSE-NEXT: movaps %xmm1, 32(%r9) 4858; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4859; SSE-NEXT: movaps %xmm1, 16(%r9) 4860; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4861; SSE-NEXT: movaps %xmm1, (%r9) 4862; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 4863; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4864; SSE-NEXT: movaps %xmm1, 112(%rax) 4865; SSE-NEXT: movapd %xmm15, 96(%rax) 4866; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4867; SSE-NEXT: movaps %xmm1, 80(%rax) 4868; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4869; SSE-NEXT: movaps %xmm1, 64(%rax) 4870; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4871; SSE-NEXT: movaps %xmm1, 48(%rax) 4872; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4873; SSE-NEXT: movaps %xmm1, 32(%rax) 4874; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4875; SSE-NEXT: movaps %xmm1, 16(%rax) 4876; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4877; SSE-NEXT: movaps %xmm1, (%rax) 4878; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 4879; SSE-NEXT: movapd %xmm0, 112(%rax) 4880; SSE-NEXT: movapd %xmm3, 96(%rax) 4881; SSE-NEXT: movapd %xmm4, 80(%rax) 4882; SSE-NEXT: movapd %xmm5, 64(%rax) 4883; SSE-NEXT: movapd %xmm6, 48(%rax) 4884; SSE-NEXT: movapd %xmm7, 32(%rax) 4885; SSE-NEXT: movapd %xmm8, 16(%rax) 4886; SSE-NEXT: movapd %xmm9, (%rax) 4887; SSE-NEXT: addq $1160, %rsp # imm = 0x488 4888; SSE-NEXT: retq 4889; 4890; AVX-LABEL: load_i32_stride7_vf32: 4891; AVX: # %bb.0: 4892; AVX-NEXT: subq $1432, %rsp # imm = 0x598 4893; AVX-NEXT: vmovaps 480(%rdi), %ymm4 4894; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4895; AVX-NEXT: vmovaps 448(%rdi), %ymm3 4896; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4897; AVX-NEXT: vmovaps 544(%rdi), %ymm5 4898; AVX-NEXT: vmovaps 32(%rdi), %ymm2 4899; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4900; AVX-NEXT: vmovaps (%rdi), %ymm1 4901; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4902; AVX-NEXT: vmovaps 96(%rdi), %ymm12 4903; AVX-NEXT: vmovaps 80(%rdi), %xmm0 4904; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4905; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] 4906; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4907; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] 4908; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 4909; AVX-NEXT: vmovaps (%rdi), %xmm8 4910; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] 4911; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4912; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] 4913; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 4914; AVX-NEXT: vmovaps 160(%rdi), %xmm2 4915; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill 4916; AVX-NEXT: vmovaps 128(%rdi), %xmm1 4917; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4918; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 4919; AVX-NEXT: vmovaps 192(%rdi), %xmm7 4920; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[1] 4921; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4922; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 4923; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 4924; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4925; AVX-NEXT: vmovaps 528(%rdi), %xmm0 4926; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4927; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] 4928; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] 4929; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 4930; AVX-NEXT: vmovaps 448(%rdi), %xmm10 4931; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] 4932; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] 4933; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 4934; AVX-NEXT: vmovaps 608(%rdi), %xmm2 4935; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4936; AVX-NEXT: vmovaps 576(%rdi), %xmm1 4937; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4938; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 4939; AVX-NEXT: vmovaps 640(%rdi), %xmm9 4940; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[1] 4941; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4942; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 4943; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 4944; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4945; AVX-NEXT: vmovaps 256(%rdi), %ymm1 4946; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4947; AVX-NEXT: vmovaps 224(%rdi), %ymm0 4948; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4949; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] 4950; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 4951; AVX-NEXT: vmovaps 224(%rdi), %xmm11 4952; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3] 4953; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4954; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 4955; AVX-NEXT: vmovaps 320(%rdi), %ymm4 4956; AVX-NEXT: vmovaps 304(%rdi), %xmm1 4957; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4958; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] 4959; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4960; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 4961; AVX-NEXT: vmovaps 384(%rdi), %xmm1 4962; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4963; AVX-NEXT: vmovaps 352(%rdi), %xmm2 4964; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4965; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 4966; AVX-NEXT: vmovaps 416(%rdi), %xmm3 4967; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm3[1] 4968; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4969; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 4970; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 4971; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4972; AVX-NEXT: vmovaps 704(%rdi), %ymm1 4973; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4974; AVX-NEXT: vmovaps 672(%rdi), %ymm0 4975; AVX-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4976; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] 4977; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 4978; AVX-NEXT: vmovaps 672(%rdi), %xmm1 4979; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4980; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 4981; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 4982; AVX-NEXT: vmovaps 768(%rdi), %ymm14 4983; AVX-NEXT: vmovaps 752(%rdi), %xmm1 4984; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4985; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2] 4986; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 4987; AVX-NEXT: vmovaps 832(%rdi), %xmm2 4988; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4989; AVX-NEXT: vmovaps 800(%rdi), %xmm1 4990; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4991; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 4992; AVX-NEXT: vmovaps 864(%rdi), %xmm6 4993; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm6[1] 4994; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 4995; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 4996; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4997; AVX-NEXT: vmovaps 64(%rdi), %ymm0 4998; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4999; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1],ymm0[2,2],ymm12[5,5],ymm0[6,6] 5000; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 5001; AVX-NEXT: vmovaps 32(%rdi), %xmm1 5002; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5003; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3] 5004; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] 5005; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 5006; AVX-NEXT: vmovaps 160(%rdi), %ymm1 5007; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5008; AVX-NEXT: vmovaps 128(%rdi), %ymm15 5009; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[0,1] 5010; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,0],ymm1[3,3],ymm15[4,4],ymm1[7,7] 5011; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5012; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 5013; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[2] 5014; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 5015; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 5016; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5017; AVX-NEXT: vmovaps 512(%rdi), %ymm0 5018; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5019; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] 5020; AVX-NEXT: vmovaps %ymm5, %ymm7 5021; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 5022; AVX-NEXT: vmovaps 480(%rdi), %xmm1 5023; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5024; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm10[1],xmm1[2,3] 5025; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5026; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] 5027; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3,4,5,6,7] 5028; AVX-NEXT: vmovaps 608(%rdi), %ymm0 5029; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5030; AVX-NEXT: vmovaps 576(%rdi), %ymm12 5031; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm0[0,1] 5032; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm12[0,0],ymm5[3,3],ymm12[4,4],ymm5[7,7] 5033; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5 5034; AVX-NEXT: vinsertps 
{{.*#+}} xmm5 = zero,xmm5[1,2],xmm9[2] 5035; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 5036; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] 5037; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5038; AVX-NEXT: vmovaps 288(%rdi), %ymm0 5039; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5040; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6] 5041; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] 5042; AVX-NEXT: vmovaps 256(%rdi), %xmm0 5043; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5044; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0],xmm11[1],xmm0[2,3] 5045; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0],mem[3,3] 5046; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] 5047; AVX-NEXT: vmovaps 384(%rdi), %ymm1 5048; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5049; AVX-NEXT: vmovaps 352(%rdi), %ymm0 5050; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5051; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[0,1] 5052; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm5[3,3],ymm0[4,4],ymm5[7,7] 5053; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5 5054; AVX-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm3[2] 5055; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 5056; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] 5057; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5058; AVX-NEXT: vmovaps 736(%rdi), %ymm5 5059; AVX-NEXT: vmovaps %ymm14, %ymm3 5060; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5061; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1],ymm5[2,2],ymm14[5,5],ymm5[6,6] 5062; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5063; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] 5064; AVX-NEXT: vmovaps 704(%rdi), %xmm4 5065; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5066; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0],xmm1[1],xmm4[2,3] 5067; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5068; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,0],mem[3,3] 5069; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm2[3,4,5,6,7] 5070; AVX-NEXT: vmovaps 832(%rdi), %ymm13 5071; AVX-NEXT: vmovaps 800(%rdi), %ymm2 5072; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],ymm13[0,1] 5073; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5074; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,0],ymm14[3,3],ymm2[4,4],ymm14[7,7] 5075; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5076; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 5077; AVX-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm6[2] 5078; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 5079; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] 5080; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5081; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 5082; AVX-NEXT: # xmm11 = mem[2,3,2,3] 5083; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5084; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm0[1],xmm11[2,3] 5085; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 5086; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 5087; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm9[0,3],ymm14[7,5],ymm9[4,7] 5088; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 5089; AVX-NEXT: vshufps {{.*#+}} ymm14 = 
ymm8[2,1],ymm14[2,0],ymm8[6,5],ymm14[6,4] 5090; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7] 5091; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload 5092; AVX-NEXT: # ymm14 = ymm15[0],mem[0],ymm15[2],mem[2] 5093; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 5094; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 5095; AVX-NEXT: # xmm14 = xmm14[0,1,2],mem[3] 5096; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 5097; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] 5098; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5099; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm10[2,3,2,3] 5100; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5101; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3] 5102; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 5103; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 5104; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm10[0,3],ymm14[7,5],ymm10[4,7] 5105; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm7[2,1],ymm14[2,0],ymm7[6,5],ymm14[6,4] 5106; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7] 5107; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload 5108; AVX-NEXT: # ymm14 = ymm12[0],mem[0],ymm12[2],mem[2] 5109; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 5110; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 5111; AVX-NEXT: # xmm14 = xmm14[0,1,2],mem[3] 5112; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 5113; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] 5114; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5115; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm1[2,3,2,3] 5116; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm4[1],xmm11[2,3] 5117; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 5118; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7] 5119; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm3[2,1],ymm14[2,0],ymm3[6,5],ymm14[6,4] 5120; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7] 5121; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm13[0],ymm2[2],ymm13[2] 5122; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 5123; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm6[3] 5124; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 5125; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5,6,7] 5126; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5127; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5128; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,3,2,3] 5129; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5130; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] 5131; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 5132; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5133; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,1],ymm3[0,3],ymm11[7,5],ymm3[4,7] 5134; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5135; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm2[2,1],ymm11[2,0],ymm2[6,5],ymm11[6,4] 5136; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3,4,5,6,7] 5137; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 5138; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5139; AVX-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm13[0],ymm5[0],ymm13[2],ymm5[2] 5140; 
AVX-NEXT: vextractf128 $1, %ymm11, %xmm11 5141; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload 5142; AVX-NEXT: # xmm11 = xmm11[0,1,2],mem[3] 5143; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 5144; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm11[5,6,7] 5145; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5146; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm8[0,0],ymm9[5,4],ymm8[4,4] 5147; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,1],ymm4[0,2],ymm8[7,5],ymm4[4,6] 5148; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload 5149; AVX-NEXT: # xmm9 = xmm0[0,1,2],mem[3] 5150; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3] 5151; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3,4,5,6,7] 5152; AVX-NEXT: vmovaps 192(%rdi), %ymm0 5153; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5154; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 5155; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1],ymm14[1,3],ymm0[4,5],ymm14[5,7] 5156; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5157; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,2],ymm11[2,0],ymm0[4,6],ymm11[6,4] 5158; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm11[5,6,7] 5159; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5160; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm10[1,0],ymm7[0,0],ymm10[5,4],ymm7[4,4] 5161; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,1],ymm4[0,2],ymm7[7,5],ymm4[4,6] 5162; AVX-NEXT: vmovaps %xmm15, %xmm10 5163; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload 5164; AVX-NEXT: # xmm8 = xmm15[0,1,2],mem[3] 5165; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] 5166; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3,4,5,6,7] 5167; AVX-NEXT: vmovaps 640(%rdi), %ymm0 5168; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5169; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 5170; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1],ymm4[1,3],ymm0[4,5],ymm4[5,7] 5171; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[2,0],ymm12[4,6],ymm11[6,4] 5172; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] 5173; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5174; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm2[0,0],ymm3[5,4],ymm2[4,4] 5175; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1],ymm7[0,2],ymm2[7,5],ymm7[4,6] 5176; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm1[0,1,2],xmm6[3] 5177; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] 5178; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] 5179; AVX-NEXT: vmovaps 416(%rdi), %ymm15 5180; AVX-NEXT: vmovaps %ymm5, %ymm8 5181; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm15[0,1],ymm5[1,3],ymm15[4,5],ymm5[5,7] 5182; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm13[0,2],ymm11[2,0],ymm13[4,6],ymm11[6,4] 5183; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7] 5184; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5185; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5186; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5187; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] 5188; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm5[0,2],ymm0[7,5],ymm5[4,6] 5189; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5190; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload 5191; AVX-NEXT: # xmm6 = 
mem[0,1,2],xmm0[3] 5192; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] 5193; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3,4,5,6,7] 5194; AVX-NEXT: vmovaps 864(%rdi), %ymm0 5195; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5196; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 5197; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,1],ymm9[1,3],ymm0[4,5],ymm9[5,7] 5198; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5199; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,2],ymm7[2,0],ymm0[4,6],ymm7[6,4] 5200; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7] 5201; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5202; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm12[2,3,0,1] 5203; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm6[0,0],ymm12[7,4],ymm6[4,4] 5204; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5205; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4] 5206; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,0],ymm0[6,4],ymm6[6,4] 5207; AVX-NEXT: vmovaps 544(%rdi), %xmm1 5208; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5209; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm1[0,1,0,1] 5210; AVX-NEXT: vmovaps 512(%rdi), %xmm7 5211; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm7[0,1,2],xmm11[3] 5212; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm10[2,3,2,3] 5213; AVX-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3] 5214; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] 5215; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5216; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5217; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5218; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] 5219; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] 5220; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 5221; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm14[2,0],ymm10[5,4],ymm14[6,4] 5222; AVX-NEXT: vmovaps %ymm14, %ymm6 5223; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] 5224; AVX-NEXT: vmovaps 64(%rdi), %xmm1 5225; AVX-NEXT: vmovaps 96(%rdi), %xmm4 5226; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,1,0,1] 5227; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5228; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] 5229; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 5230; AVX-NEXT: # xmm14 = mem[2,3,2,3] 5231; AVX-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] 5232; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] 5233; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] 5234; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5235; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] 5236; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm0[0,0],ymm13[7,4],ymm0[4,4] 5237; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm15[1,0],ymm8[2,0],ymm15[5,4],ymm8[6,4] 5238; AVX-NEXT: vmovaps %ymm15, %ymm11 5239; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5240; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] 5241; AVX-NEXT: vmovaps 320(%rdi), %xmm2 5242; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5243; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm2[0,1,0,1] 5244; AVX-NEXT: vmovaps 288(%rdi), %xmm3 5245; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] 5246; AVX-NEXT: vpermilps $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 5247; AVX-NEXT: # xmm15 = mem[2,3,2,3] 5248; AVX-NEXT: vblendps {{.*#+}} xmm15 = mem[0],xmm15[1],mem[2,3] 5249; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] 5250; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] 5251; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5252; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5253; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] 5254; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] 5255; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 5256; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] 5257; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4] 5258; AVX-NEXT: vmovaps 768(%rdi), %xmm0 5259; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5260; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,1,0,1] 5261; AVX-NEXT: vmovaps 736(%rdi), %xmm2 5262; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm2[0,1,2],xmm15[3] 5263; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 5264; AVX-NEXT: # xmm13 = mem[2,3,2,3] 5265; AVX-NEXT: vblendps {{.*#+}} xmm13 = mem[0],xmm13[1],mem[2,3] 5266; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] 5267; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm14[4,5,6,7] 5268; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5269; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm10[2,1],ymm6[3,3],ymm10[6,5],ymm6[7,7] 5270; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5271; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload 5272; AVX-NEXT: # xmm14 = xmm0[0],mem[1],xmm0[2,3] 5273; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 5274; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4] 5275; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] 5276; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5277; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload 5278; AVX-NEXT: # ymm14 = ymm0[0,0],mem[1,0],ymm0[4,4],mem[5,4] 5279; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 5280; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm14[2,0],xmm1[3,2] 5281; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7] 5282; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5283; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload 5284; AVX-NEXT: # ymm13 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7] 5285; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5286; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 5287; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0],xmm10[1],xmm15[2,3] 5288; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 5289; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4] 5290; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5291; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3] 5292; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 5293; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5294; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm4[1,0],ymm0[4,4],ymm4[5,4] 5295; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 5296; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm14[2,0],xmm7[3,2] 5297; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm13[4,5,6,7] 5298; AVX-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5299; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload 5300; AVX-NEXT: # ymm13 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7] 5301; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 5302; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload 5303; AVX-NEXT: # xmm14 = mem[0],xmm7[1],mem[2,3] 5304; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 5305; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4] 5306; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 5307; AVX-NEXT: # xmm3 = mem[0,1,2],xmm3[3] 5308; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 5309; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload 5310; AVX-NEXT: # ymm14 = ymm11[0,0],mem[1,0],ymm11[4,4],mem[5,4] 5311; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 5312; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm14[2,0],xmm3[3,2] 5313; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm13[4,5,6,7] 5314; AVX-NEXT: vmovaps %ymm8, %ymm7 5315; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm8[2,1],ymm9[3,3],ymm8[6,5],ymm9[7,7] 5316; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 5317; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 5318; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0],xmm8[1],xmm9[2,3] 5319; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 5320; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,0],ymm3[2,0],ymm12[5,4],ymm3[6,4] 5321; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5322; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] 5323; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 5324; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5325; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm1[0,0],ymm14[1,0],ymm1[4,4],ymm14[5,4] 5326; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 5327; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,0],xmm2[3,2] 5328; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm3[4,5,6,7] 5329; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5330; AVX-NEXT: # xmm2 = mem[0,1,0,1] 5331; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3] 5332; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] 5333; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 5334; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,3] 5335; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5336; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1] 5337; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4] 5338; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,3,2,3] 5339; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3] 5340; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 5341; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] 5342; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] 5343; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5344; AVX-NEXT: # xmm2 = mem[0,1,0,1] 5345; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm5[3] 5346; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm14[2,0],ymm1[5,4],ymm14[6,4] 5347; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 5348; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,0],xmm0[2,3] 5349; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[2,3,0,1] 5350; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,0],ymm2[0,0],ymm7[7,4],ymm2[4,4] 5351; AVX-NEXT: 
vshufps {{.*#+}} xmm4 = xmm8[2,3,2,3] 5352; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] 5353; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 5354; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] 5355; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 5356; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5357; AVX-NEXT: # xmm2 = mem[0,1,0,1] 5358; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 5359; AVX-NEXT: # xmm2 = xmm2[0,1,2],mem[3] 5360; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5361; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload 5362; AVX-NEXT: # ymm4 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4] 5363; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 5364; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,0],xmm2[2,3] 5365; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5366; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] 5367; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,0],ymm4[0,0],ymm1[7,4],ymm4[4,4] 5368; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5369; AVX-NEXT: # xmm5 = mem[2,3,2,3] 5370; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload 5371; AVX-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] 5372; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 5373; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4] 5374; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 5375; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5376; AVX-NEXT: # xmm4 = mem[0,1,0,1] 5377; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 5378; AVX-NEXT: # xmm4 = xmm4[0,1,2],mem[3] 5379; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload 5380; AVX-NEXT: # ymm5 = ymm11[1,0],mem[2,0],ymm11[5,4],mem[6,4] 5381; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5 5382; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3] 5383; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5384; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,0,1] 5385; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,0],ymm5[0,0],ymm1[7,4],ymm5[4,4] 5386; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 5387; AVX-NEXT: # xmm6 = mem[2,3,2,3] 5388; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload 5389; AVX-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] 5390; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 5391; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,0],ymm6[4,5],ymm5[6,4] 5392; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 5393; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5394; AVX-NEXT: vmovaps %ymm1, 96(%rsi) 5395; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5396; AVX-NEXT: vmovaps %ymm1, 32(%rsi) 5397; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5398; AVX-NEXT: vmovaps %ymm5, 64(%rsi) 5399; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5400; AVX-NEXT: vmovaps %ymm5, (%rsi) 5401; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5402; AVX-NEXT: vmovaps %ymm1, 96(%rdx) 5403; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5404; AVX-NEXT: vmovaps %ymm1, 32(%rdx) 5405; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5406; AVX-NEXT: 
vmovaps %ymm1, 64(%rdx) 5407; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5408; AVX-NEXT: vmovaps %ymm1, (%rdx) 5409; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5410; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 5411; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5412; AVX-NEXT: vmovaps %ymm1, 96(%rcx) 5413; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5414; AVX-NEXT: vmovaps %ymm1, 64(%rcx) 5415; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5416; AVX-NEXT: vmovaps %ymm1, (%rcx) 5417; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5418; AVX-NEXT: vmovaps %ymm1, 96(%r8) 5419; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5420; AVX-NEXT: vmovaps %ymm1, 32(%r8) 5421; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5422; AVX-NEXT: vmovaps %ymm1, 64(%r8) 5423; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5424; AVX-NEXT: vmovaps %ymm1, (%r8) 5425; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5426; AVX-NEXT: vmovaps %ymm1, 96(%r9) 5427; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5428; AVX-NEXT: vmovaps %ymm1, 32(%r9) 5429; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5430; AVX-NEXT: vmovaps %ymm1, (%r9) 5431; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5432; AVX-NEXT: vmovaps %ymm1, 64(%r9) 5433; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 5434; AVX-NEXT: vmovaps %ymm12, 96(%rax) 5435; AVX-NEXT: vmovaps %ymm13, 32(%rax) 5436; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5437; AVX-NEXT: vmovaps %ymm1, 64(%rax) 5438; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5439; AVX-NEXT: vmovaps %ymm1, (%rax) 5440; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 5441; AVX-NEXT: vmovaps %ymm4, 32(%rax) 5442; AVX-NEXT: vmovaps %ymm2, (%rax) 5443; AVX-NEXT: vmovaps %ymm0, 96(%rax) 5444; AVX-NEXT: vmovaps %ymm3, 64(%rax) 5445; AVX-NEXT: addq $1432, %rsp # imm = 0x598 5446; AVX-NEXT: vzeroupper 5447; AVX-NEXT: retq 5448; 5449; AVX2-LABEL: load_i32_stride7_vf32: 5450; AVX2: # %bb.0: 5451; AVX2-NEXT: subq $1192, %rsp # imm = 0x4A8 5452; AVX2-NEXT: vmovdqa 320(%rdi), %ymm9 5453; AVX2-NEXT: vmovdqa 256(%rdi), %ymm4 5454; AVX2-NEXT: vmovdqa 224(%rdi), %ymm5 5455; AVX2-NEXT: vmovdqa 544(%rdi), %ymm12 5456; AVX2-NEXT: vmovdqa 480(%rdi), %ymm7 5457; AVX2-NEXT: vmovdqa 448(%rdi), %ymm8 5458; AVX2-NEXT: vmovdqa (%rdi), %ymm14 5459; AVX2-NEXT: vmovdqa 32(%rdi), %ymm13 5460; AVX2-NEXT: vmovdqa 96(%rdi), %ymm11 5461; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm0 5462; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm11[4,5,6,7] 5463; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] 5464; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] 5465; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5466; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5467; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 5468; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 5469; AVX2-NEXT: vmovdqa 128(%rdi), %xmm2 5470; AVX2-NEXT: vmovdqa 160(%rdi), %xmm3 5471; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5472; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 5473; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5474; AVX2-NEXT: vpbroadcastd 196(%rdi), %ymm3 5475; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 5476; AVX2-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 5477; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5478; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] 5479; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5480; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5481; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 5482; AVX2-NEXT: vpbroadcastq 528(%rdi), %ymm2 5483; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] 5484; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 5485; AVX2-NEXT: vmovdqa 576(%rdi), %xmm2 5486; AVX2-NEXT: vmovdqa 608(%rdi), %xmm3 5487; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5488; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 5489; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5490; AVX2-NEXT: vpbroadcastd 644(%rdi), %ymm3 5491; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 5492; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 5493; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5494; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] 5495; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5496; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5497; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 5498; AVX2-NEXT: vpbroadcastq 304(%rdi), %ymm2 5499; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] 5500; AVX2-NEXT: vmovdqa %ymm9, %ymm10 5501; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5502; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 5503; AVX2-NEXT: vmovdqa 352(%rdi), %xmm2 5504; AVX2-NEXT: vmovdqa 384(%rdi), %xmm3 5505; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5506; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 5507; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5508; AVX2-NEXT: vpbroadcastd 420(%rdi), %ymm3 5509; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 5510; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 5511; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5512; AVX2-NEXT: vmovdqa 704(%rdi), %ymm2 5513; AVX2-NEXT: vmovdqa 672(%rdi), %ymm6 5514; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7] 5515; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5516; AVX2-NEXT: vmovdqa %ymm2, %ymm3 5517; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5518; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm0 5519; AVX2-NEXT: vmovdqa 768(%rdi), %ymm15 5520; AVX2-NEXT: vpbroadcastq 752(%rdi), %ymm1 5521; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] 5522; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 5523; AVX2-NEXT: vmovdqa 800(%rdi), %xmm1 5524; AVX2-NEXT: vmovdqa 832(%rdi), %xmm2 5525; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5526; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] 5527; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5528; AVX2-NEXT: vpbroadcastd 868(%rdi), %ymm2 5529; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 5530; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 5531; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5532; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] 5533; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 5534; AVX2-NEXT: vmovdqa 608(%rdi), %ymm2 5535; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill 5536; AVX2-NEXT: vmovdqa 576(%rdi), %ymm1 5537; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5538; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] 5539; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] 5540; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] 5541; AVX2-NEXT: vmovdqa 512(%rdi), %ymm9 5542; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7] 5543; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5544; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5545; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] 5546; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5547; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] 5548; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] 5549; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 5550; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 5551; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5552; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 5553; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5554; AVX2-NEXT: vmovdqa 384(%rdi), %ymm7 5555; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5556; AVX2-NEXT: vmovdqa 352(%rdi), %ymm2 5557; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5558; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 5559; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 5560; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 5561; AVX2-NEXT: vmovdqa 288(%rdi), %ymm2 5562; AVX2-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill 5563; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] 5564; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] 5565; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5566; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] 5567; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 5568; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 5569; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5570; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 5571; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5572; AVX2-NEXT: vmovdqa 832(%rdi), %ymm8 5573; AVX2-NEXT: vmovdqa 800(%rdi), %ymm10 5574; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] 5575; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5576; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5577; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 5578; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 5579; AVX2-NEXT: vmovdqa 736(%rdi), %ymm7 5580; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7] 5581; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5582; AVX2-NEXT: vmovdqa %ymm15, %ymm5 5583; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5584; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] 5585; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5586; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] 5587; 
AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 5588; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 5589; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5590; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 5591; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5592; AVX2-NEXT: vmovdqa 160(%rdi), %ymm6 5593; AVX2-NEXT: vmovdqa 128(%rdi), %ymm15 5594; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] 5595; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5596; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 5597; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] 5598; AVX2-NEXT: vmovdqa 64(%rdi), %ymm1 5599; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5600; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] 5601; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] 5602; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5603; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7] 5604; AVX2-NEXT: vpermd %ymm3, %ymm0, %ymm0 5605; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 5606; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5607; AVX2-NEXT: vmovdqa 80(%rdi), %xmm0 5608; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 5609; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] 5610; AVX2-NEXT: vpbroadcastd 8(%rdi), %xmm2 5611; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3 5612; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] 5613; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 5614; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] 5615; AVX2-NEXT: vmovdqa %ymm6, %ymm11 5616; AVX2-NEXT: vpbroadcastd 204(%rdi), %ymm4 5617; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] 5618; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 5619; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5620; AVX2-NEXT: vmovdqa 528(%rdi), %xmm0 5621; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] 5622; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] 5623; AVX2-NEXT: vpbroadcastd 456(%rdi), %xmm4 5624; AVX2-NEXT: vmovdqa 480(%rdi), %xmm2 5625; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] 5626; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] 5627; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 5628; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 5629; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] 5630; AVX2-NEXT: vpbroadcastd 652(%rdi), %ymm15 5631; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm15[7] 5632; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] 5633; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5634; AVX2-NEXT: vmovdqa 752(%rdi), %xmm0 5635; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] 5636; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] 5637; AVX2-NEXT: vpbroadcastd 680(%rdi), %xmm15 5638; AVX2-NEXT: 
vmovdqa 704(%rdi), %xmm7 5639; AVX2-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] 5640; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] 5641; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] 5642; AVX2-NEXT: vpbroadcastd 876(%rdi), %ymm13 5643; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] 5644; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] 5645; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5646; AVX2-NEXT: vmovdqa 304(%rdi), %xmm0 5647; AVX2-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload 5648; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 5649; AVX2-NEXT: vpalignr {{.*#+}} ymm13 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] 5650; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7] 5651; AVX2-NEXT: vpbroadcastd 232(%rdi), %xmm15 5652; AVX2-NEXT: vmovdqa 256(%rdi), %xmm0 5653; AVX2-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] 5654; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] 5655; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5656; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 5657; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] 5658; AVX2-NEXT: vpbroadcastd 428(%rdi), %ymm14 5659; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] 5660; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] 5661; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5662; AVX2-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 5663; AVX2-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] 5664; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] 5665; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] 5666; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] 5667; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] 5668; AVX2-NEXT: vmovdqa %ymm11, %ymm13 5669; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 5670; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,2],ymm11[1,3],ymm10[4,6],ymm11[5,7] 5671; AVX2-NEXT: vbroadcastss 208(%rdi), %ymm11 5672; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] 5673; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] 5674; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5675; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5676; AVX2-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 5677; AVX2-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] 5678; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] 5679; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] 5680; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] 5681; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] 5682; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,2],ymm9[1,3],ymm12[4,6],ymm9[5,7] 5683; AVX2-NEXT: vbroadcastss 656(%rdi), %ymm3 5684; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 5685; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 5686; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5687; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7] 5688; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] 5689; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] 5690; AVX2-NEXT: vpshufd {{.*#+}} 
ymm1 = ymm1[3,1,1,0,7,5,5,4] 5691; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 5692; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] 5693; AVX2-NEXT: vmovaps %ymm6, %ymm15 5694; AVX2-NEXT: vbroadcastss 432(%rdi), %ymm2 5695; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 5696; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 5697; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 5698; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5699; AVX2-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 5700; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] 5701; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],mem[3] 5702; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] 5703; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 5704; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 5705; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 5706; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5707; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm6[1,3],ymm14[4,6],ymm6[5,7] 5708; AVX2-NEXT: vbroadcastss 880(%rdi), %ymm2 5709; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 5710; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 5711; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5712; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [4,3,0,0] 5713; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5714; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 5715; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 5716; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 5717; AVX2-NEXT: vbroadcastss 548(%rdi), %xmm2 5718; AVX2-NEXT: vmovaps 512(%rdi), %xmm7 5719; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] 5720; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 5721; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] 5722; AVX2-NEXT: vpermps %ymm12, %ymm11, %ymm2 5723; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] 5724; AVX2-NEXT: vmovaps %ymm9, %ymm12 5725; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm3 5726; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 5727; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 5728; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5729; AVX2-NEXT: vbroadcastss 100(%rdi), %xmm2 5730; AVX2-NEXT: vmovaps 64(%rdi), %xmm0 5731; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] 5732; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5733; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 5734; AVX2-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] 5735; AVX2-NEXT: vpermps %ymm3, %ymm1, %ymm3 5736; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] 5737; AVX2-NEXT: vpermps %ymm10, %ymm11, %ymm3 5738; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] 5739; AVX2-NEXT: vbroadcastss 212(%rdi), %ymm4 5740; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] 5741; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 5742; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5743; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5744; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 5745; AVX2-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] 5746; AVX2-NEXT: vpermps %ymm2, %ymm1, 
%ymm3 5747; AVX2-NEXT: vbroadcastss 324(%rdi), %xmm4 5748; AVX2-NEXT: vmovaps 288(%rdi), %xmm2 5749; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] 5750; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] 5751; AVX2-NEXT: vpermps %ymm15, %ymm11, %ymm4 5752; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] 5753; AVX2-NEXT: vmovaps %ymm8, %ymm9 5754; AVX2-NEXT: vbroadcastss 436(%rdi), %ymm8 5755; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] 5756; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] 5757; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5758; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5759; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 5760; AVX2-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] 5761; AVX2-NEXT: vpermps %ymm3, %ymm1, %ymm1 5762; AVX2-NEXT: vbroadcastss 772(%rdi), %xmm4 5763; AVX2-NEXT: vmovaps 736(%rdi), %xmm3 5764; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] 5765; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] 5766; AVX2-NEXT: vpermps %ymm14, %ymm11, %ymm4 5767; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] 5768; AVX2-NEXT: vbroadcastss 884(%rdi), %ymm8 5769; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] 5770; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] 5771; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5772; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] 5773; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] 5774; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] 5775; AVX2-NEXT: vbroadcastss 216(%rdi), %ymm4 5776; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] 5777; AVX2-NEXT: vmovaps 96(%rdi), %xmm10 5778; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] 5779; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] 5780; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 5781; AVX2-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] 5782; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 5783; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] 5784; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5785; AVX2-NEXT: vmovaps 544(%rdi), %xmm4 5786; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm7[3] 5787; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] 5788; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 5789; AVX2-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] 5790; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 5791; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] 5792; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload 5793; AVX2-NEXT: # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] 5794; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] 5795; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] 5796; AVX2-NEXT: vbroadcastss 664(%rdi), %ymm7 5797; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] 5798; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] 5799; AVX2-NEXT: vmovaps 320(%rdi), %xmm12 5800; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] 5801; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] 5802; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 5803; AVX2-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] 5804; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 5805; AVX2-NEXT: vblendps {{.*#+}} xmm2 
= xmm5[0,1],xmm2[2,3] 5806; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] 5807; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] 5808; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] 5809; AVX2-NEXT: vbroadcastss 440(%rdi), %ymm7 5810; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] 5811; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] 5812; AVX2-NEXT: vmovaps 768(%rdi), %xmm2 5813; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] 5814; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] 5815; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload 5816; AVX2-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] 5817; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7 5818; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] 5819; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7] 5820; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] 5821; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] 5822; AVX2-NEXT: vbroadcastss 888(%rdi), %ymm8 5823; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] 5824; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7] 5825; AVX2-NEXT: vbroadcastss 584(%rdi), %xmm3 5826; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 5827; AVX2-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] 5828; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 5829; AVX2-NEXT: vpermps 640(%rdi), %ymm11, %ymm8 5830; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] 5831; AVX2-NEXT: vbroadcastss 528(%rdi), %ymm8 5832; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] 5833; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload 5834; AVX2-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] 5835; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload 5836; AVX2-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] 5837; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8 5838; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] 5839; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] 5840; AVX2-NEXT: vbroadcastss 808(%rdi), %xmm3 5841; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 5842; AVX2-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] 5843; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 5844; AVX2-NEXT: vpermps 864(%rdi), %ymm11, %ymm4 5845; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] 5846; AVX2-NEXT: vbroadcastss 752(%rdi), %ymm4 5847; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] 5848; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 5849; AVX2-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] 5850; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 5851; AVX2-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] 5852; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 5853; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] 5854; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 5855; AVX2-NEXT: vbroadcastss 136(%rdi), %xmm3 5856; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 5857; AVX2-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] 5858; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 5859; AVX2-NEXT: vpermps 192(%rdi), %ymm11, %ymm4 5860; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] 5861; AVX2-NEXT: vbroadcastss 80(%rdi), %ymm4 5862; AVX2-NEXT: 
vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] 5863; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 5864; AVX2-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] 5865; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 5866; AVX2-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] 5867; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 5868; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] 5869; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 5870; AVX2-NEXT: vbroadcastss 360(%rdi), %xmm4 5871; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 5872; AVX2-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 5873; AVX2-NEXT: vpermps 416(%rdi), %ymm11, %ymm6 5874; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 5875; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] 5876; AVX2-NEXT: vbroadcastss 304(%rdi), %ymm6 5877; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] 5878; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload 5879; AVX2-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] 5880; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload 5881; AVX2-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] 5882; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10 5883; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] 5884; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] 5885; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5886; AVX2-NEXT: vmovaps %ymm6, 96(%rsi) 5887; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5888; AVX2-NEXT: vmovaps %ymm6, 32(%rsi) 5889; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5890; AVX2-NEXT: vmovaps %ymm6, 64(%rsi) 5891; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5892; AVX2-NEXT: vmovaps %ymm6, (%rsi) 5893; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5894; AVX2-NEXT: vmovaps %ymm6, 96(%rdx) 5895; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5896; AVX2-NEXT: vmovaps %ymm6, 32(%rdx) 5897; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5898; AVX2-NEXT: vmovaps %ymm6, 64(%rdx) 5899; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5900; AVX2-NEXT: vmovaps %ymm6, (%rdx) 5901; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5902; AVX2-NEXT: vmovaps %ymm6, 32(%rcx) 5903; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5904; AVX2-NEXT: vmovaps %ymm6, 96(%rcx) 5905; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5906; AVX2-NEXT: vmovaps %ymm6, 64(%rcx) 5907; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5908; AVX2-NEXT: vmovaps %ymm6, (%rcx) 5909; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5910; AVX2-NEXT: vmovaps %ymm6, 96(%r8) 5911; AVX2-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload 5912; AVX2-NEXT: vmovaps %ymm6, 32(%r8) 5913; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5914; AVX2-NEXT: vmovaps %ymm6, 64(%r8) 5915; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5916; AVX2-NEXT: vmovaps %ymm6, (%r8) 5917; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5918; AVX2-NEXT: vmovaps %ymm6, 96(%r9) 5919; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5920; AVX2-NEXT: vmovaps %ymm6, 32(%r9) 5921; AVX2-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5922; AVX2-NEXT: vmovaps %ymm6, (%r9) 5923; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 5924; AVX2-NEXT: vmovaps %ymm6, 64(%r9) 5925; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 5926; AVX2-NEXT: vmovaps %ymm7, 96(%rax) 5927; AVX2-NEXT: vmovaps %ymm5, 32(%rax) 5928; AVX2-NEXT: vmovaps %ymm1, 64(%rax) 5929; AVX2-NEXT: vmovaps %ymm0, (%rax) 5930; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 5931; AVX2-NEXT: vmovaps %ymm4, 32(%rax) 5932; AVX2-NEXT: vmovaps %ymm3, (%rax) 5933; AVX2-NEXT: vmovaps %ymm2, 96(%rax) 5934; AVX2-NEXT: vmovaps %ymm8, 64(%rax) 5935; AVX2-NEXT: addq $1192, %rsp # imm = 0x4A8 5936; AVX2-NEXT: vzeroupper 5937; AVX2-NEXT: retq 5938; 5939; AVX2-FP-LABEL: load_i32_stride7_vf32: 5940; AVX2-FP: # %bb.0: 5941; AVX2-FP-NEXT: subq $1192, %rsp # imm = 0x4A8 5942; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm9 5943; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm4 5944; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm5 5945; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm12 5946; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm7 5947; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm8 5948; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm14 5949; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm13 5950; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm11 5951; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm0 5952; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm11[4,5,6,7] 5953; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] 5954; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] 5955; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5956; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5957; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 5958; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 5959; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm2 5960; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm3 5961; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5962; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 5963; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5964; AVX2-FP-NEXT: vpbroadcastd 196(%rdi), %ymm3 5965; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 5966; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 5967; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5968; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] 5969; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5970; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5971; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 5972; AVX2-FP-NEXT: vpbroadcastq 528(%rdi), %ymm2 5973; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] 5974; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 5975; AVX2-FP-NEXT: vmovdqa 576(%rdi), %xmm2 5976; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm3 5977; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5978; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 5979; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5980; AVX2-FP-NEXT: vpbroadcastd 644(%rdi), %ymm3 5981; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 5982; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 5983; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5984; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] 5985; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5986; AVX2-FP-NEXT: vmovdqu 
%ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5987; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 5988; AVX2-FP-NEXT: vpbroadcastq 304(%rdi), %ymm2 5989; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] 5990; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm10 5991; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5992; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 5993; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm2 5994; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm3 5995; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5996; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 5997; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5998; AVX2-FP-NEXT: vpbroadcastd 420(%rdi), %ymm3 5999; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 6000; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 6001; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6002; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm2 6003; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm6 6004; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7] 6005; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6006; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm3 6007; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6008; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm0 6009; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm15 6010; AVX2-FP-NEXT: vpbroadcastq 752(%rdi), %ymm1 6011; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] 6012; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 6013; AVX2-FP-NEXT: vmovdqa 800(%rdi), %xmm1 6014; AVX2-FP-NEXT: vmovdqa 832(%rdi), %xmm2 6015; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6016; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] 6017; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6018; AVX2-FP-NEXT: vpbroadcastd 868(%rdi), %ymm2 6019; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 6020; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 6021; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6022; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] 6023; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6024; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm2 6025; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6026; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm1 6027; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6028; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] 6029; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] 6030; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] 6031; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm9 6032; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7] 6033; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6034; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6035; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] 6036; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6037; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] 6038; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] 6039; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 6040; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 6041; AVX2-FP-NEXT: 
vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6042; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 6043; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6044; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm7 6045; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6046; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm2 6047; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6048; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 6049; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 6050; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 6051; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm2 6052; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill 6053; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] 6054; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] 6055; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6056; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] 6057; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 6058; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 6059; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6060; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 6061; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6062; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm8 6063; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm10 6064; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] 6065; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6066; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6067; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 6068; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 6069; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm7 6070; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7] 6071; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6072; AVX2-FP-NEXT: vmovdqa %ymm15, %ymm5 6073; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6074; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] 6075; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6076; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] 6077; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 6078; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 6079; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6080; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 6081; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6082; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm6 6083; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm15 6084; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] 6085; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6086; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 6087; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] 6088; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm1 6089; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6090; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] 6091; AVX2-FP-NEXT: vpblendd {{.*#+}} 
ymm4 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] 6092; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6093; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7] 6094; AVX2-FP-NEXT: vpermd %ymm3, %ymm0, %ymm0 6095; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 6096; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6097; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm0 6098; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 6099; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] 6100; AVX2-FP-NEXT: vpbroadcastd 8(%rdi), %xmm2 6101; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3 6102; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] 6103; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 6104; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] 6105; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm11 6106; AVX2-FP-NEXT: vpbroadcastd 204(%rdi), %ymm4 6107; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] 6108; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 6109; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6110; AVX2-FP-NEXT: vmovdqa 528(%rdi), %xmm0 6111; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] 6112; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] 6113; AVX2-FP-NEXT: vpbroadcastd 456(%rdi), %xmm4 6114; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm2 6115; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] 6116; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] 6117; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 6118; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 6119; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] 6120; AVX2-FP-NEXT: vpbroadcastd 652(%rdi), %ymm15 6121; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm15[7] 6122; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] 6123; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6124; AVX2-FP-NEXT: vmovdqa 752(%rdi), %xmm0 6125; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] 6126; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] 6127; AVX2-FP-NEXT: vpbroadcastd 680(%rdi), %xmm15 6128; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm7 6129; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] 6130; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] 6131; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] 6132; AVX2-FP-NEXT: vpbroadcastd 876(%rdi), %ymm13 6133; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] 6134; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] 6135; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6136; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm0 6137; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload 6138; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 6139; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm13 = 
ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] 6140; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7] 6141; AVX2-FP-NEXT: vpbroadcastd 232(%rdi), %xmm15 6142; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm0 6143; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] 6144; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] 6145; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6146; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 6147; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] 6148; AVX2-FP-NEXT: vpbroadcastd 428(%rdi), %ymm14 6149; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] 6150; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] 6151; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6152; AVX2-FP-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6153; AVX2-FP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] 6154; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] 6155; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] 6156; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] 6157; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] 6158; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm13 6159; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 6160; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,2],ymm11[1,3],ymm10[4,6],ymm11[5,7] 6161; AVX2-FP-NEXT: vbroadcastss 208(%rdi), %ymm11 6162; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] 6163; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] 6164; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6165; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6166; AVX2-FP-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6167; AVX2-FP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] 6168; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] 6169; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] 6170; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] 6171; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] 6172; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,2],ymm9[1,3],ymm12[4,6],ymm9[5,7] 6173; AVX2-FP-NEXT: vbroadcastss 656(%rdi), %ymm3 6174; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 6175; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 6176; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6177; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7] 6178; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] 6179; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] 6180; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] 6181; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 6182; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] 6183; AVX2-FP-NEXT: vmovaps %ymm6, %ymm15 6184; AVX2-FP-NEXT: vbroadcastss 432(%rdi), %ymm2 6185; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 6186; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 6187; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 6188; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6189; AVX2-FP-NEXT: vpblendd $253, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 6190; AVX2-FP-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] 6191; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],mem[3] 6192; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] 6193; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 6194; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 6195; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 6196; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6197; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm6[1,3],ymm14[4,6],ymm6[5,7] 6198; AVX2-FP-NEXT: vbroadcastss 880(%rdi), %ymm2 6199; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 6200; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 6201; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6202; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm1 = [4,3,0,0] 6203; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6204; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 6205; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 6206; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm0 6207; AVX2-FP-NEXT: vbroadcastss 548(%rdi), %xmm2 6208; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm7 6209; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] 6210; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 6211; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] 6212; AVX2-FP-NEXT: vpermps %ymm12, %ymm11, %ymm2 6213; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] 6214; AVX2-FP-NEXT: vmovaps %ymm9, %ymm12 6215; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm3 6216; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 6217; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 6218; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6219; AVX2-FP-NEXT: vbroadcastss 100(%rdi), %xmm2 6220; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm0 6221; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] 6222; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6223; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 6224; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] 6225; AVX2-FP-NEXT: vpermps %ymm3, %ymm1, %ymm3 6226; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] 6227; AVX2-FP-NEXT: vpermps %ymm10, %ymm11, %ymm3 6228; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] 6229; AVX2-FP-NEXT: vbroadcastss 212(%rdi), %ymm4 6230; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] 6231; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 6232; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6233; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6234; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 6235; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] 6236; AVX2-FP-NEXT: vpermps %ymm2, %ymm1, %ymm3 6237; AVX2-FP-NEXT: vbroadcastss 324(%rdi), %xmm4 6238; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm2 6239; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] 6240; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] 6241; AVX2-FP-NEXT: vpermps %ymm15, %ymm11, %ymm4 6242; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] 6243; AVX2-FP-NEXT: vmovaps %ymm8, %ymm9 6244; AVX2-FP-NEXT: 
vbroadcastss 436(%rdi), %ymm8 6245; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] 6246; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] 6247; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6248; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6249; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 6250; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] 6251; AVX2-FP-NEXT: vpermps %ymm3, %ymm1, %ymm1 6252; AVX2-FP-NEXT: vbroadcastss 772(%rdi), %xmm4 6253; AVX2-FP-NEXT: vmovaps 736(%rdi), %xmm3 6254; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] 6255; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] 6256; AVX2-FP-NEXT: vpermps %ymm14, %ymm11, %ymm4 6257; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] 6258; AVX2-FP-NEXT: vbroadcastss 884(%rdi), %ymm8 6259; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] 6260; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] 6261; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6262; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] 6263; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] 6264; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] 6265; AVX2-FP-NEXT: vbroadcastss 216(%rdi), %ymm4 6266; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] 6267; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm10 6268; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] 6269; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] 6270; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 6271; AVX2-FP-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] 6272; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 6273; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] 6274; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6275; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm4 6276; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm7[3] 6277; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] 6278; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 6279; AVX2-FP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] 6280; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 6281; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] 6282; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload 6283; AVX2-FP-NEXT: # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] 6284; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] 6285; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] 6286; AVX2-FP-NEXT: vbroadcastss 664(%rdi), %ymm7 6287; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] 6288; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] 6289; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm12 6290; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] 6291; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] 6292; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 6293; AVX2-FP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] 6294; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 6295; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] 6296; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] 6297; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] 6298; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] 
6299; AVX2-FP-NEXT: vbroadcastss 440(%rdi), %ymm7 6300; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] 6301; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] 6302; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm2 6303; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] 6304; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] 6305; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload 6306; AVX2-FP-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] 6307; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7 6308; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] 6309; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7] 6310; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] 6311; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] 6312; AVX2-FP-NEXT: vbroadcastss 888(%rdi), %ymm8 6313; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] 6314; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7] 6315; AVX2-FP-NEXT: vbroadcastss 584(%rdi), %xmm3 6316; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 6317; AVX2-FP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] 6318; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 6319; AVX2-FP-NEXT: vpermps 640(%rdi), %ymm11, %ymm8 6320; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] 6321; AVX2-FP-NEXT: vbroadcastss 528(%rdi), %ymm8 6322; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] 6323; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload 6324; AVX2-FP-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] 6325; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload 6326; AVX2-FP-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] 6327; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8 6328; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] 6329; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] 6330; AVX2-FP-NEXT: vbroadcastss 808(%rdi), %xmm3 6331; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 6332; AVX2-FP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] 6333; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 6334; AVX2-FP-NEXT: vpermps 864(%rdi), %ymm11, %ymm4 6335; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] 6336; AVX2-FP-NEXT: vbroadcastss 752(%rdi), %ymm4 6337; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] 6338; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 6339; AVX2-FP-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] 6340; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 6341; AVX2-FP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] 6342; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 6343; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] 6344; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 6345; AVX2-FP-NEXT: vbroadcastss 136(%rdi), %xmm3 6346; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 6347; AVX2-FP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] 6348; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 6349; AVX2-FP-NEXT: vpermps 192(%rdi), %ymm11, %ymm4 6350; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] 6351; AVX2-FP-NEXT: vbroadcastss 80(%rdi), %ymm4 6352; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] 6353; AVX2-FP-NEXT: vpermilps 
$238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 6354; AVX2-FP-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] 6355; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 6356; AVX2-FP-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] 6357; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 6358; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] 6359; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 6360; AVX2-FP-NEXT: vbroadcastss 360(%rdi), %xmm4 6361; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 6362; AVX2-FP-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 6363; AVX2-FP-NEXT: vpermps 416(%rdi), %ymm11, %ymm6 6364; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 6365; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] 6366; AVX2-FP-NEXT: vbroadcastss 304(%rdi), %ymm6 6367; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] 6368; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload 6369; AVX2-FP-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] 6370; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload 6371; AVX2-FP-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] 6372; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10 6373; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] 6374; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] 6375; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6376; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rsi) 6377; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6378; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rsi) 6379; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6380; AVX2-FP-NEXT: vmovaps %ymm6, 64(%rsi) 6381; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6382; AVX2-FP-NEXT: vmovaps %ymm6, (%rsi) 6383; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6384; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rdx) 6385; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6386; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rdx) 6387; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6388; AVX2-FP-NEXT: vmovaps %ymm6, 64(%rdx) 6389; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6390; AVX2-FP-NEXT: vmovaps %ymm6, (%rdx) 6391; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6392; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rcx) 6393; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6394; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rcx) 6395; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6396; AVX2-FP-NEXT: vmovaps %ymm6, 64(%rcx) 6397; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6398; AVX2-FP-NEXT: vmovaps %ymm6, (%rcx) 6399; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6400; AVX2-FP-NEXT: vmovaps %ymm6, 96(%r8) 6401; AVX2-FP-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload 6402; AVX2-FP-NEXT: vmovaps %ymm6, 32(%r8) 6403; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6404; AVX2-FP-NEXT: vmovaps %ymm6, 64(%r8) 6405; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6406; AVX2-FP-NEXT: vmovaps %ymm6, (%r8) 6407; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6408; AVX2-FP-NEXT: vmovaps %ymm6, 96(%r9) 6409; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6410; AVX2-FP-NEXT: vmovaps %ymm6, 32(%r9) 6411; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6412; AVX2-FP-NEXT: vmovaps %ymm6, (%r9) 6413; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6414; AVX2-FP-NEXT: vmovaps %ymm6, 64(%r9) 6415; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6416; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rax) 6417; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rax) 6418; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rax) 6419; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) 6420; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6421; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rax) 6422; AVX2-FP-NEXT: vmovaps %ymm3, (%rax) 6423; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rax) 6424; AVX2-FP-NEXT: vmovaps %ymm8, 64(%rax) 6425; AVX2-FP-NEXT: addq $1192, %rsp # imm = 0x4A8 6426; AVX2-FP-NEXT: vzeroupper 6427; AVX2-FP-NEXT: retq 6428; 6429; AVX2-FCP-LABEL: load_i32_stride7_vf32: 6430; AVX2-FCP: # %bb.0: 6431; AVX2-FCP-NEXT: subq $1224, %rsp # imm = 0x4C8 6432; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm10 6433; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 6434; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 6435; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm12 6436; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm7 6437; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm8 6438; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm14 6439; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 6440; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6441; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 6442; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6443; AVX2-FCP-NEXT: vpbroadcastq 80(%rdi), %ymm0 6444; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6445; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] 6446; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm2[6],ymm14[7] 6447; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6448; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 6449; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 6450; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm2 6451; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm3 6452; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6453; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 6454; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6455; AVX2-FCP-NEXT: vpbroadcastd 196(%rdi), %ymm3 6456; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 6457; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 6458; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6459; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] 6460; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6461; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6462; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 6463; AVX2-FCP-NEXT: vpbroadcastq 528(%rdi), %ymm2 6464; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] 6465; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 6466; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %xmm2 6467; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %xmm3 6468; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6469; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 6470; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6471; AVX2-FCP-NEXT: vpbroadcastd 644(%rdi), %ymm3 6472; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 6473; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4],ymm2[5,6,7] 6474; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6475; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] 6476; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6477; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6478; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 6479; AVX2-FCP-NEXT: vpbroadcastq 304(%rdi), %ymm2 6480; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] 6481; AVX2-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill 6482; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 6483; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm2 6484; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm3 6485; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6486; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 6487; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6488; AVX2-FCP-NEXT: vpbroadcastd 420(%rdi), %ymm3 6489; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 6490; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 6491; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6492; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm2 6493; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm6 6494; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7] 6495; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6496; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3 6497; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6498; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 6499; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm11 6500; AVX2-FCP-NEXT: vpbroadcastq 752(%rdi), %ymm1 6501; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] 6502; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 6503; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %xmm1 6504; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %xmm2 6505; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6506; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] 6507; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6508; AVX2-FCP-NEXT: vpbroadcastd 868(%rdi), %ymm2 6509; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 6510; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 6511; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6512; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] 6513; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6514; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm2 6515; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6516; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 6517; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6518; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] 6519; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] 6520; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] 6521; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm13 6522; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] 6523; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6524; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6525; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] 6526; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
6527; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] 6528; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] 6529; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 6530; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 6531; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6532; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 6533; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6534; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm7 6535; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6536; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm2 6537; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6538; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 6539; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 6540; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 6541; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2 6542; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6543; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] 6544; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] 6545; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6546; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] 6547; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 6548; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 6549; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6550; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 6551; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6552; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm9 6553; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm15 6554; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] 6555; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6556; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6557; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 6558; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 6559; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm5 6560; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6561; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm5[2,3],ymm11[4,5],ymm5[6,7] 6562; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6563; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] 6564; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6565; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] 6566; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 6567; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 6568; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6569; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 6570; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6571; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 6572; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 6573; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] 6574; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6575; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill 6576; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 6577; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] 6578; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 6579; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6580; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] 6581; AVX2-FCP-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload 6582; AVX2-FCP-NEXT: # ymm4 = mem[0],ymm14[1],mem[2,3,4],ymm14[5],mem[6,7] 6583; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6584; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7] 6585; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm0 6586; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 6587; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6588; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 6589; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 6590; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] 6591; AVX2-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm2 6592; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 6593; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] 6594; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 6595; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] 6596; AVX2-FCP-NEXT: vpbroadcastd 204(%rdi), %ymm4 6597; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] 6598; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 6599; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6600; AVX2-FCP-NEXT: vmovdqa 528(%rdi), %xmm0 6601; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] 6602; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] 6603; AVX2-FCP-NEXT: vpbroadcastd 456(%rdi), %xmm4 6604; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm2 6605; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] 6606; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] 6607; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 6608; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 6609; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] 6610; AVX2-FCP-NEXT: vpbroadcastd 652(%rdi), %ymm13 6611; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm13[7] 6612; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] 6613; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6614; AVX2-FCP-NEXT: vmovdqa 752(%rdi), %xmm0 6615; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] 6616; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] 6617; AVX2-FCP-NEXT: vpbroadcastd 680(%rdi), %xmm13 6618; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm10 6619; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm10[1],xmm13[2,3] 6620; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] 6621; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm9[0],ymm15[2],ymm9[2] 6622; AVX2-FCP-NEXT: vpbroadcastd 876(%rdi), %ymm15 6623; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] 
6624; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] 6625; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6626; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm0 6627; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 6628; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload 6629; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm13 = ymm6[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] 6630; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7] 6631; AVX2-FCP-NEXT: vpbroadcastd 232(%rdi), %xmm15 6632; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm0 6633; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] 6634; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] 6635; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6636; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 6637; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] 6638; AVX2-FCP-NEXT: vpbroadcastd 428(%rdi), %ymm14 6639; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] 6640; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] 6641; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6642; AVX2-FCP-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6643; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] 6644; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] 6645; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] 6646; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] 6647; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] 6648; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 6649; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 6650; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,2],ymm9[1,3],ymm15[4,6],ymm9[5,7] 6651; AVX2-FCP-NEXT: vbroadcastss 208(%rdi), %ymm11 6652; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] 6653; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] 6654; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6655; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6656; AVX2-FCP-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6657; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] 6658; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] 6659; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] 6660; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] 6661; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] 6662; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm13 6663; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,2],ymm12[1,3],ymm8[4,6],ymm12[5,7] 6664; AVX2-FCP-NEXT: vbroadcastss 656(%rdi), %ymm3 6665; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 6666; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 6667; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6668; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] 6669; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] 6670; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] 6671; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] 6672; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 
6673; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm4[1,3],ymm5[4,6],ymm4[5,7] 6674; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm8 6675; AVX2-FCP-NEXT: vbroadcastss 432(%rdi), %ymm2 6676; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 6677; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 6678; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6679; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6680; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 6681; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] 6682; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],mem[3] 6683; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] 6684; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 6685; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 6686; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6687; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6688; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm5[1,3],ymm6[4,6],ymm5[5,7] 6689; AVX2-FCP-NEXT: vbroadcastss 880(%rdi), %ymm2 6690; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 6691; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 6692; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 6693; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,3,0,0] 6694; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6695; AVX2-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 6696; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 6697; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 6698; AVX2-FCP-NEXT: vpbroadcastd 548(%rdi), %xmm2 6699; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm14 6700; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] 6701; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 6702; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] 6703; AVX2-FCP-NEXT: vpermd %ymm13, %ymm11, %ymm2 6704; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] 6705; AVX2-FCP-NEXT: vmovaps %ymm12, %ymm13 6706; AVX2-FCP-NEXT: vpbroadcastd 660(%rdi), %ymm3 6707; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 6708; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 6709; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6710; AVX2-FCP-NEXT: vpbroadcastd 100(%rdi), %xmm2 6711; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 6712; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] 6713; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6714; AVX2-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 6715; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] 6716; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm3 6717; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] 6718; AVX2-FCP-NEXT: vmovaps %ymm15, %ymm10 6719; AVX2-FCP-NEXT: vpermd %ymm15, %ymm11, %ymm3 6720; AVX2-FCP-NEXT: vmovaps %ymm9, %ymm7 6721; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] 6722; AVX2-FCP-NEXT: vpbroadcastd 212(%rdi), %ymm4 6723; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] 6724; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 6725; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6726; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload 6727; AVX2-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 6728; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] 6729; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3 6730; AVX2-FCP-NEXT: vpbroadcastd 324(%rdi), %xmm4 6731; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 6732; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] 6733; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] 6734; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 6735; AVX2-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm4 6736; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] 6737; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm15 6738; AVX2-FCP-NEXT: vpbroadcastd 436(%rdi), %ymm8 6739; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] 6740; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] 6741; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6742; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6743; AVX2-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 6744; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] 6745; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm1 6746; AVX2-FCP-NEXT: vpbroadcastd 772(%rdi), %xmm4 6747; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %xmm3 6748; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] 6749; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] 6750; AVX2-FCP-NEXT: vpermd %ymm6, %ymm11, %ymm4 6751; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 6752; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm12 6753; AVX2-FCP-NEXT: vpbroadcastd 884(%rdi), %ymm8 6754; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] 6755; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] 6756; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6757; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] 6758; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,3,3,1,0,7,7] 6759; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm1 6760; AVX2-FCP-NEXT: vbroadcastss 216(%rdi), %ymm6 6761; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] 6762; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm6 6763; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] 6764; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] 6765; AVX2-FCP-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload 6766; AVX2-FCP-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] 6767; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 6768; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] 6769; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6770; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm8 6771; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1,2],xmm14[3] 6772; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] 6773; AVX2-FCP-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 6774; AVX2-FCP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] 6775; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 6776; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] 6777; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload 6778; AVX2-FCP-NEXT: # ymm5 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] 6779; AVX2-FCP-NEXT: vpermps %ymm5, %ymm4, %ymm5 6780; AVX2-FCP-NEXT: vbroadcastss 664(%rdi), %ymm7 6781; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] 6782; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] 6783; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm13 6784; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] 6785; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] 6786; AVX2-FCP-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 6787; AVX2-FCP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] 6788; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 6789; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] 6790; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7] 6791; AVX2-FCP-NEXT: vpermps %ymm5, %ymm4, %ymm5 6792; AVX2-FCP-NEXT: vbroadcastss 440(%rdi), %ymm7 6793; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] 6794; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] 6795; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %xmm2 6796; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] 6797; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] 6798; AVX2-FCP-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload 6799; AVX2-FCP-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] 6800; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 6801; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] 6802; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload 6803; AVX2-FCP-NEXT: # ymm7 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] 6804; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm4 6805; AVX2-FCP-NEXT: vbroadcastss 888(%rdi), %ymm7 6806; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] 6807; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7] 6808; AVX2-FCP-NEXT: vbroadcastss 584(%rdi), %xmm3 6809; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 6810; AVX2-FCP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] 6811; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 6812; AVX2-FCP-NEXT: vpermd 640(%rdi), %ymm11, %ymm4 6813; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] 6814; AVX2-FCP-NEXT: vpbroadcastd 528(%rdi), %ymm4 6815; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3] 6816; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload 6817; AVX2-FCP-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] 6818; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload 6819; AVX2-FCP-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] 6820; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 6821; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] 6822; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] 6823; AVX2-FCP-NEXT: vbroadcastss 808(%rdi), %xmm3 6824; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 6825; AVX2-FCP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] 6826; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 6827; AVX2-FCP-NEXT: vpermd 864(%rdi), %ymm11, %ymm4 6828; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] 6829; AVX2-FCP-NEXT: vpbroadcastd 752(%rdi), %ymm4 6830; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] 6831; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 6832; AVX2-FCP-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] 6833; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 6834; AVX2-FCP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] 6835; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, 
%xmm4 6836; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] 6837; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 6838; AVX2-FCP-NEXT: vbroadcastss 136(%rdi), %xmm3 6839; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 6840; AVX2-FCP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] 6841; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 6842; AVX2-FCP-NEXT: vpermd 192(%rdi), %ymm11, %ymm4 6843; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] 6844; AVX2-FCP-NEXT: vpbroadcastd 80(%rdi), %ymm4 6845; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3] 6846; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 6847; AVX2-FCP-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] 6848; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 6849; AVX2-FCP-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] 6850; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 6851; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] 6852; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 6853; AVX2-FCP-NEXT: vbroadcastss 360(%rdi), %xmm4 6854; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 6855; AVX2-FCP-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 6856; AVX2-FCP-NEXT: vpermd 416(%rdi), %ymm11, %ymm6 6857; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 6858; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] 6859; AVX2-FCP-NEXT: vpbroadcastd 304(%rdi), %ymm6 6860; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3] 6861; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload 6862; AVX2-FCP-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] 6863; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload 6864; AVX2-FCP-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] 6865; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 6866; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] 6867; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] 6868; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6869; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rsi) 6870; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6871; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rsi) 6872; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6873; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%rsi) 6874; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6875; AVX2-FCP-NEXT: vmovaps %ymm6, (%rsi) 6876; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6877; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rdx) 6878; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6879; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rdx) 6880; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6881; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%rdx) 6882; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6883; AVX2-FCP-NEXT: vmovaps %ymm6, (%rdx) 6884; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6885; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rcx) 6886; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6887; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rcx) 6888; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6889; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%rcx) 6890; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6891; AVX2-FCP-NEXT: vmovaps %ymm6, (%rcx) 6892; AVX2-FCP-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload 6893; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%r8) 6894; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6895; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%r8) 6896; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6897; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%r8) 6898; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6899; AVX2-FCP-NEXT: vmovaps %ymm6, (%r8) 6900; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6901; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%r9) 6902; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6903; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%r9) 6904; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6905; AVX2-FCP-NEXT: vmovaps %ymm6, (%r9) 6906; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6907; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%r9) 6908; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6909; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rax) 6910; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rax) 6911; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rax) 6912; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) 6913; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6914; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rax) 6915; AVX2-FCP-NEXT: vmovaps %ymm3, (%rax) 6916; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax) 6917; AVX2-FCP-NEXT: vmovaps %ymm8, 64(%rax) 6918; AVX2-FCP-NEXT: addq $1224, %rsp # imm = 0x4C8 6919; AVX2-FCP-NEXT: vzeroupper 6920; AVX2-FCP-NEXT: retq 6921; 6922; AVX512-LABEL: load_i32_stride7_vf32: 6923; AVX512: # %bb.0: 6924; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 6925; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 6926; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm1 6927; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm0 6928; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm4 6929; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm2 6930; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm5 6931; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm6 6932; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm3 6933; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm13 6934; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm15 6935; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm9 6936; AVX512-NEXT: vmovdqa64 (%rdi), %zmm10 6937; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm11 6938; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm14 6939; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm12 6940; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 6941; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 6942; AVX512-NEXT: vmovdqa64 %zmm12, %zmm17 6943; AVX512-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 6944; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] 6945; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 6946; AVX512-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 6947; AVX512-NEXT: movw $992, %di # imm = 0x3E0 6948; AVX512-NEXT: kmovw %edi, %k1 6949; AVX512-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} 6950; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 6951; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 6952; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 6953; AVX512-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 6954; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 6955; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 6956; AVX512-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 6957; AVX512-NEXT: movb $-32, %dil 6958; AVX512-NEXT: kmovw %edi, %k2 6959; AVX512-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} 6960; AVX512-NEXT: vpermi2d 
%zmm6, %zmm3, %zmm17 6961; AVX512-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 6962; AVX512-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 6963; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 6964; AVX512-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} 6965; AVX512-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} 6966; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 6967; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 6968; AVX512-NEXT: vmovdqa64 %zmm14, %zmm19 6969; AVX512-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 6970; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] 6971; AVX512-NEXT: vmovdqa64 %zmm10, %zmm17 6972; AVX512-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 6973; AVX512-NEXT: movw $480, %di # imm = 0x1E0 6974; AVX512-NEXT: kmovw %edi, %k2 6975; AVX512-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} 6976; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 6977; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 6978; AVX512-NEXT: vmovdqa64 %zmm9, %zmm20 6979; AVX512-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 6980; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 6981; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 6982; AVX512-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 6983; AVX512-NEXT: movw $-512, %di # imm = 0xFE00 6984; AVX512-NEXT: kmovw %edi, %k1 6985; AVX512-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} 6986; AVX512-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 6987; AVX512-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 6988; AVX512-NEXT: vmovdqa64 %zmm4, %zmm20 6989; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 6990; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 6991; AVX512-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} 6992; AVX512-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} 6993; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 6994; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 6995; AVX512-NEXT: vmovdqa64 %zmm14, %zmm22 6996; AVX512-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 6997; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] 6998; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 6999; AVX512-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 7000; AVX512-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} 7001; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 7002; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] 7003; AVX512-NEXT: vmovdqa64 %zmm9, %zmm23 7004; AVX512-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 7005; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 7006; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] 7007; AVX512-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 7008; AVX512-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} 7009; AVX512-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 7010; AVX512-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 7011; AVX512-NEXT: vmovdqa64 %zmm4, %zmm23 7012; AVX512-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 7013; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 7014; AVX512-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} 7015; AVX512-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} 7016; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 7017; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7018; AVX512-NEXT: vmovdqa64 %zmm14, %zmm25 7019; AVX512-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 7020; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] 7021; AVX512-NEXT: vmovdqa64 %zmm11, %zmm23 7022; AVX512-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 7023; AVX512-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} 7024; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = 
[0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 7025; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 7026; AVX512-NEXT: vmovdqa64 %zmm15, %zmm26 7027; AVX512-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 7028; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 7029; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7030; AVX512-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 7031; AVX512-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} 7032; AVX512-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 7033; AVX512-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 7034; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 7035; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 7036; AVX512-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} 7037; AVX512-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} 7038; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 7039; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 7040; AVX512-NEXT: vmovdqa64 %zmm15, %zmm25 7041; AVX512-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 7042; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 7043; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7044; AVX512-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 7045; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 7046; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 7047; AVX512-NEXT: vmovdqa64 %zmm14, %zmm24 7048; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 7049; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] 7050; AVX512-NEXT: vmovdqa64 %zmm10, %zmm30 7051; AVX512-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 7052; AVX512-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 7053; AVX512-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} 7054; AVX512-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 7055; AVX512-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 7056; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 7057; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 7058; AVX512-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 7059; AVX512-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 7060; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 7061; AVX512-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 7062; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 7063; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7064; AVX512-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 7065; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 7066; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 7067; AVX512-NEXT: vmovdqa64 %zmm14, %zmm29 7068; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 7069; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] 7070; AVX512-NEXT: vmovdqa64 %zmm10, %zmm31 7071; AVX512-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 7072; AVX512-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 7073; AVX512-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} 7074; AVX512-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 7075; AVX512-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 7076; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 7077; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 7078; AVX512-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 7079; AVX512-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} 7080; AVX512-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 7081; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 7082; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 7083; AVX512-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 7084; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 7085; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 7086; AVX512-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 7087; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm14 
= [6,13,20,27] 7088; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 7089; AVX512-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 7090; AVX512-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} 7091; AVX512-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 7092; AVX512-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 7093; AVX512-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 7094; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 7095; AVX512-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm0 7096; AVX512-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} 7097; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rsi) 7098; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) 7099; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rdx) 7100; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) 7101; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rcx) 7102; AVX512-NEXT: vmovdqa64 %zmm20, (%rcx) 7103; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r8) 7104; AVX512-NEXT: vmovdqa64 %zmm23, (%r8) 7105; AVX512-NEXT: vmovdqa64 %zmm25, 64(%r9) 7106; AVX512-NEXT: vmovdqa64 %zmm24, (%r9) 7107; AVX512-NEXT: vmovdqa64 %zmm26, 64(%r10) 7108; AVX512-NEXT: vmovdqa64 %zmm29, (%r10) 7109; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) 7110; AVX512-NEXT: vmovdqa64 %zmm10, (%rax) 7111; AVX512-NEXT: vzeroupper 7112; AVX512-NEXT: retq 7113; 7114; AVX512-FCP-LABEL: load_i32_stride7_vf32: 7115; AVX512-FCP: # %bb.0: 7116; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7117; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 7118; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm1 7119; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0 7120; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 7121; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 7122; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5 7123; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6 7124; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 7125; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 7126; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15 7127; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 7128; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 7129; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 7130; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm14 7131; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 7132; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 7133; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 7134; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 7135; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 7136; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] 7137; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 7138; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 7139; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0 7140; AVX512-FCP-NEXT: kmovw %edi, %k1 7141; AVX512-FCP-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} 7142; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 7143; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7144; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 7145; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 7146; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 7147; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 7148; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 7149; AVX512-FCP-NEXT: movb $-32, %dil 7150; AVX512-FCP-NEXT: kmovw %edi, %k2 7151; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} 7152; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 7153; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 7154; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 7155; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 7156; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} 7157; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} 7158; 
AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 7159; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7160; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm19 7161; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 7162; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] 7163; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 7164; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 7165; AVX512-FCP-NEXT: movw $480, %di # imm = 0x1E0 7166; AVX512-FCP-NEXT: kmovw %edi, %k2 7167; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} 7168; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 7169; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 7170; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 7171; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 7172; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 7173; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 7174; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 7175; AVX512-FCP-NEXT: movw $-512, %di # imm = 0xFE00 7176; AVX512-FCP-NEXT: kmovw %edi, %k1 7177; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} 7178; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 7179; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 7180; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm20 7181; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 7182; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 7183; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} 7184; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} 7185; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 7186; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7187; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 7188; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 7189; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] 7190; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 7191; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 7192; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} 7193; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 7194; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] 7195; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm23 7196; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 7197; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 7198; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] 7199; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 7200; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} 7201; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 7202; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 7203; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 7204; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 7205; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 7206; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} 7207; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} 7208; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 7209; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7210; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 7211; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 7212; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] 7213; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 7214; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 7215; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} 7216; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 
7217; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 7218; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 7219; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 7220; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 7221; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7222; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 7223; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} 7224; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 7225; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 7226; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 7227; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 7228; AVX512-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} 7229; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} 7230; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 7231; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 7232; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 7233; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 7234; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 7235; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7236; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 7237; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 7238; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 7239; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 7240; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 7241; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] 7242; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 7243; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 7244; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 7245; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} 7246; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 7247; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 7248; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 7249; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 7250; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 7251; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 7252; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 7253; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 7254; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 7255; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7256; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 7257; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 7258; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 7259; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 7260; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 7261; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] 7262; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 7263; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 7264; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 7265; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} 7266; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 7267; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 7268; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 7269; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 7270; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 7271; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} 7272; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 7273; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 7274; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 7275; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 7276; AVX512-FCP-NEXT: vbroadcasti64x4 
{{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 7277; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 7278; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 7279; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] 7280; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 7281; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 7282; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} 7283; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 7284; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 7285; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 7286; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 7287; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm0 7288; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} 7289; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) 7290; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) 7291; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) 7292; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) 7293; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) 7294; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) 7295; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) 7296; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8) 7297; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) 7298; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%r9) 7299; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%r10) 7300; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%r10) 7301; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) 7302; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rax) 7303; AVX512-FCP-NEXT: vzeroupper 7304; AVX512-FCP-NEXT: retq 7305; 7306; AVX512DQ-LABEL: load_i32_stride7_vf32: 7307; AVX512DQ: # %bb.0: 7308; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 7309; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 7310; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm1 7311; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm0 7312; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm4 7313; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm2 7314; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm5 7315; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm6 7316; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm3 7317; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm13 7318; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm15 7319; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm9 7320; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm10 7321; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm11 7322; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm14 7323; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm12 7324; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 7325; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 7326; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm17 7327; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 7328; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] 7329; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 7330; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 7331; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 7332; AVX512DQ-NEXT: kmovw %edi, %k1 7333; AVX512DQ-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} 7334; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 7335; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7336; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 7337; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 7338; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 7339; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 7340; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 7341; AVX512DQ-NEXT: movb $-32, %dil 7342; AVX512DQ-NEXT: kmovw %edi, %k2 7343; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} 7344; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 7345; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm19, 
%zmm17 7346; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 7347; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 7348; AVX512DQ-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} 7349; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} 7350; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 7351; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7352; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm19 7353; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 7354; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] 7355; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm17 7356; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 7357; AVX512DQ-NEXT: movw $480, %di # imm = 0x1E0 7358; AVX512DQ-NEXT: kmovw %edi, %k2 7359; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} 7360; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 7361; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 7362; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm20 7363; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 7364; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 7365; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 7366; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 7367; AVX512DQ-NEXT: movw $-512, %di # imm = 0xFE00 7368; AVX512DQ-NEXT: kmovw %edi, %k1 7369; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} 7370; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 7371; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 7372; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm20 7373; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 7374; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 7375; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} 7376; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} 7377; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 7378; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7379; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm22 7380; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 7381; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] 7382; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20 7383; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 7384; AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} 7385; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 7386; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] 7387; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm23 7388; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 7389; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 7390; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] 7391; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 7392; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} 7393; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 7394; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 7395; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm23 7396; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 7397; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 7398; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} 7399; AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} 7400; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 7401; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7402; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm25 7403; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 7404; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] 7405; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm23 7406; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 7407; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm23 
{%k2} 7408; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 7409; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 7410; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm26 7411; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 7412; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 7413; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7414; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 7415; AVX512DQ-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} 7416; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 7417; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 7418; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 7419; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 7420; AVX512DQ-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} 7421; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} 7422; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 7423; AVX512DQ-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 7424; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm25 7425; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 7426; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 7427; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7428; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 7429; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 7430; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 7431; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm24 7432; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 7433; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] 7434; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm30 7435; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 7436; AVX512DQ-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 7437; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} 7438; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 7439; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 7440; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 7441; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 7442; AVX512DQ-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 7443; AVX512DQ-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 7444; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 7445; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 7446; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 7447; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7448; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 7449; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 7450; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 7451; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm29 7452; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 7453; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] 7454; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm31 7455; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 7456; AVX512DQ-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 7457; AVX512DQ-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} 7458; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 7459; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 7460; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 7461; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 7462; AVX512DQ-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 7463; AVX512DQ-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} 7464; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 7465; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 7466; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 7467; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 7468; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = 
[14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 7469; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 7470; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 7471; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] 7472; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 7473; AVX512DQ-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 7474; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} 7475; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 7476; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 7477; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 7478; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 7479; AVX512DQ-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm0 7480; AVX512DQ-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} 7481; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rsi) 7482; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi) 7483; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rdx) 7484; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx) 7485; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rcx) 7486; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rcx) 7487; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%r8) 7488; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%r8) 7489; AVX512DQ-NEXT: vmovdqa64 %zmm25, 64(%r9) 7490; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%r9) 7491; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%r10) 7492; AVX512DQ-NEXT: vmovdqa64 %zmm29, (%r10) 7493; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax) 7494; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax) 7495; AVX512DQ-NEXT: vzeroupper 7496; AVX512DQ-NEXT: retq 7497; 7498; AVX512DQ-FCP-LABEL: load_i32_stride7_vf32: 7499; AVX512DQ-FCP: # %bb.0: 7500; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7501; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 7502; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm1 7503; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0 7504; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 7505; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 7506; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5 7507; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6 7508; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 7509; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 7510; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15 7511; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 7512; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 7513; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 7514; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm14 7515; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 7516; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 7517; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 7518; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 7519; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 7520; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] 7521; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 7522; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 7523; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0 7524; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 7525; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} 7526; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 7527; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7528; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 7529; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 7530; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 7531; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 7532; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 7533; AVX512DQ-FCP-NEXT: movb $-32, %dil 7534; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 7535; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} 7536; 
AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 7537; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 7538; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 7539; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 7540; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} 7541; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} 7542; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 7543; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7544; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm19 7545; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 7546; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] 7547; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 7548; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 7549; AVX512DQ-FCP-NEXT: movw $480, %di # imm = 0x1E0 7550; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 7551; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} 7552; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 7553; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 7554; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 7555; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 7556; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 7557; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 7558; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 7559; AVX512DQ-FCP-NEXT: movw $-512, %di # imm = 0xFE00 7560; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 7561; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} 7562; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 7563; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 7564; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm20 7565; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 7566; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 7567; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} 7568; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} 7569; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 7570; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7571; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 7572; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 7573; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] 7574; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 7575; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 7576; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} 7577; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 7578; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] 7579; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm23 7580; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 7581; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 7582; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] 7583; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 7584; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} 7585; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 7586; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 7587; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 7588; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 7589; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 7590; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} 7591; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} 7592; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 7593; AVX512DQ-FCP-NEXT: # zmm24 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7594; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 7595; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 7596; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] 7597; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 7598; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 7599; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} 7600; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 7601; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 7602; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 7603; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 7604; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 7605; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7606; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 7607; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} 7608; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 7609; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 7610; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 7611; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 7612; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} 7613; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} 7614; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 7615; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 7616; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 7617; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 7618; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 7619; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7620; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 7621; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 7622; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 7623; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 7624; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 7625; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] 7626; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 7627; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 7628; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 7629; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} 7630; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 7631; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 7632; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 7633; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 7634; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 7635; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 7636; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 7637; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 7638; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 7639; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7640; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 7641; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 7642; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 7643; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 7644; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 7645; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] 7646; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 7647; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 7648; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 7649; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} 7650; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 
7651; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 7652; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 7653; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 7654; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 7655; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} 7656; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 7657; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 7658; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 7659; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 7660; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 7661; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 7662; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 7663; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] 7664; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 7665; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 7666; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} 7667; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 7668; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 7669; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 7670; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 7671; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm0 7672; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} 7673; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) 7674; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) 7675; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) 7676; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) 7677; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) 7678; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) 7679; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) 7680; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r8) 7681; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) 7682; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%r9) 7683; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%r10) 7684; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%r10) 7685; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) 7686; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) 7687; AVX512DQ-FCP-NEXT: vzeroupper 7688; AVX512DQ-FCP-NEXT: retq 7689; 7690; AVX512BW-LABEL: load_i32_stride7_vf32: 7691; AVX512BW: # %bb.0: 7692; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 7693; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 7694; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm1 7695; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 7696; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 7697; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 7698; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm5 7699; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm6 7700; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 7701; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13 7702; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 7703; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 7704; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 7705; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 7706; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm14 7707; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 7708; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 7709; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 7710; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 7711; AVX512BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 7712; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] 7713; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 7714; AVX512BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 7715; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 7716; AVX512BW-NEXT: kmovd %edi, %k1 7717; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} 7718; AVX512BW-NEXT: vbroadcasti32x4 
{{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 7719; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7720; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 7721; AVX512BW-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 7722; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 7723; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 7724; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 7725; AVX512BW-NEXT: movb $-32, %dil 7726; AVX512BW-NEXT: kmovd %edi, %k2 7727; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} 7728; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 7729; AVX512BW-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 7730; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 7731; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 7732; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} 7733; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} 7734; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 7735; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7736; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm19 7737; AVX512BW-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 7738; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] 7739; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 7740; AVX512BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 7741; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 7742; AVX512BW-NEXT: kmovd %edi, %k2 7743; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} 7744; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 7745; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 7746; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm20 7747; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 7748; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 7749; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 7750; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 7751; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 7752; AVX512BW-NEXT: kmovd %edi, %k1 7753; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} 7754; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 7755; AVX512BW-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 7756; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 7757; AVX512BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 7758; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 7759; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} 7760; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} 7761; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 7762; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7763; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 7764; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 7765; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] 7766; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 7767; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 7768; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} 7769; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 7770; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] 7771; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm23 7772; AVX512BW-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 7773; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 7774; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] 7775; AVX512BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 7776; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} 7777; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 7778; AVX512BW-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 7779; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 7780; AVX512BW-NEXT: vpermt2d 
%zmm2, %zmm21, %zmm23 7781; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 7782; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} 7783; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} 7784; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 7785; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7786; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 7787; AVX512BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 7788; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] 7789; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 7790; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 7791; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} 7792; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 7793; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 7794; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm26 7795; AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 7796; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 7797; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7798; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 7799; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} 7800; AVX512BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 7801; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 7802; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 7803; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 7804; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} 7805; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} 7806; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 7807; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 7808; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 7809; AVX512BW-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 7810; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 7811; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7812; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 7813; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 7814; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 7815; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 7816; AVX512BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 7817; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] 7818; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 7819; AVX512BW-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 7820; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 7821; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} 7822; AVX512BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 7823; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 7824; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 7825; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 7826; AVX512BW-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 7827; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 7828; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 7829; AVX512BW-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 7830; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 7831; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7832; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 7833; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 7834; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 7835; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm29 7836; AVX512BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 7837; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] 7838; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm31 7839; AVX512BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 7840; AVX512BW-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 7841; AVX512BW-NEXT: 
vmovdqa32 %zmm26, %zmm29 {%k1} 7842; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 7843; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 7844; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 7845; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 7846; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 7847; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} 7848; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 7849; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 7850; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 7851; AVX512BW-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 7852; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 7853; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 7854; AVX512BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 7855; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] 7856; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 7857; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 7858; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} 7859; AVX512BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 7860; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 7861; AVX512BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 7862; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 7863; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm0 7864; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} 7865; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) 7866; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) 7867; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rdx) 7868; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rdx) 7869; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) 7870; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rcx) 7871; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) 7872; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) 7873; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r9) 7874; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9) 7875; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r10) 7876; AVX512BW-NEXT: vmovdqa64 %zmm29, (%r10) 7877; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) 7878; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) 7879; AVX512BW-NEXT: vzeroupper 7880; AVX512BW-NEXT: retq 7881; 7882; AVX512BW-FCP-LABEL: load_i32_stride7_vf32: 7883; AVX512BW-FCP: # %bb.0: 7884; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7885; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 7886; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm1 7887; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0 7888; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 7889; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 7890; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5 7891; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6 7892; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 7893; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 7894; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15 7895; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 7896; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 7897; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 7898; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm14 7899; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 7900; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 7901; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 7902; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 7903; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 7904; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] 7905; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 7906; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 7907; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 7908; AVX512BW-FCP-NEXT: kmovd %edi, %k1 7909; AVX512BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm8 
{%k1} 7910; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 7911; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7912; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 7913; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 7914; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 7915; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 7916; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 7917; AVX512BW-FCP-NEXT: movb $-32, %dil 7918; AVX512BW-FCP-NEXT: kmovd %edi, %k2 7919; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} 7920; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 7921; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 7922; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 7923; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 7924; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} 7925; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} 7926; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 7927; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7928; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm19 7929; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 7930; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] 7931; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 7932; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 7933; AVX512BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 7934; AVX512BW-FCP-NEXT: kmovd %edi, %k2 7935; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} 7936; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 7937; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 7938; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 7939; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 7940; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 7941; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 7942; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 7943; AVX512BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 7944; AVX512BW-FCP-NEXT: kmovd %edi, %k1 7945; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} 7946; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 7947; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 7948; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm20 7949; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 7950; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 7951; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} 7952; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} 7953; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 7954; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7955; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 7956; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 7957; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] 7958; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 7959; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 7960; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} 7961; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 7962; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] 7963; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm23 7964; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 7965; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 7966; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] 7967; 
AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 7968; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} 7969; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 7970; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 7971; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 7972; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 7973; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 7974; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} 7975; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} 7976; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 7977; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 7978; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 7979; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 7980; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] 7981; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 7982; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 7983; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} 7984; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 7985; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 7986; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 7987; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 7988; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 7989; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 7990; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 7991; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} 7992; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 7993; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 7994; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 7995; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 7996; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} 7997; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} 7998; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 7999; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 8000; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 8001; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 8002; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 8003; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 8004; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 8005; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 8006; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 8007; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 8008; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 8009; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] 8010; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 8011; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 8012; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 8013; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} 8014; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 8015; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 8016; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 8017; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 8018; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 8019; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 8020; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 8021; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 8022; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 8023; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 8024; AVX512BW-FCP-NEXT: vpermt2d 
%zmm13, %zmm27, %zmm26 8025; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 8026; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 8027; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 8028; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 8029; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] 8030; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 8031; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 8032; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 8033; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} 8034; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 8035; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 8036; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 8037; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 8038; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 8039; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} 8040; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 8041; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 8042; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 8043; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 8044; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 8045; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 8046; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 8047; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] 8048; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 8049; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 8050; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} 8051; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 8052; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 8053; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 8054; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 8055; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm0 8056; AVX512BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} 8057; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) 8058; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) 8059; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) 8060; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) 8061; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) 8062; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) 8063; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) 8064; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8) 8065; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) 8066; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9) 8067; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r10) 8068; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, (%r10) 8069; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) 8070; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) 8071; AVX512BW-FCP-NEXT: vzeroupper 8072; AVX512BW-FCP-NEXT: retq 8073; 8074; AVX512DQ-BW-LABEL: load_i32_stride7_vf32: 8075; AVX512DQ-BW: # %bb.0: 8076; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 8077; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 8078; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm1 8079; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm0 8080; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm4 8081; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm2 8082; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm5 8083; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm6 8084; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm3 8085; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm13 8086; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm15 8087; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm9 8088; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm10 8089; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm11 
8090; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm14 8091; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm12 8092; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 8093; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 8094; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm17 8095; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 8096; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] 8097; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 8098; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 8099; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 8100; AVX512DQ-BW-NEXT: kmovd %edi, %k1 8101; AVX512DQ-BW-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} 8102; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 8103; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8104; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 8105; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 8106; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 8107; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 8108; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 8109; AVX512DQ-BW-NEXT: movb $-32, %dil 8110; AVX512DQ-BW-NEXT: kmovd %edi, %k2 8111; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} 8112; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 8113; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 8114; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 8115; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 8116; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} 8117; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} 8118; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 8119; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8120; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm19 8121; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 8122; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] 8123; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm17 8124; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 8125; AVX512DQ-BW-NEXT: movw $480, %di # imm = 0x1E0 8126; AVX512DQ-BW-NEXT: kmovd %edi, %k2 8127; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} 8128; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 8129; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 8130; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm20 8131; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 8132; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 8133; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 8134; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 8135; AVX512DQ-BW-NEXT: movw $-512, %di # imm = 0xFE00 8136; AVX512DQ-BW-NEXT: kmovd %edi, %k1 8137; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} 8138; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 8139; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 8140; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm20 8141; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 8142; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 8143; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} 8144; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} 8145; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 8146; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8147; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm22 8148; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 8149; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} 
ymm19 = [18,25,0,7,14,0,0,0] 8150; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20 8151; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 8152; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} 8153; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 8154; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] 8155; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm23 8156; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 8157; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 8158; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] 8159; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 8160; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} 8161; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 8162; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 8163; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm23 8164; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 8165; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 8166; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} 8167; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} 8168; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 8169; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8170; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm25 8171; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 8172; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] 8173; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm23 8174; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 8175; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} 8176; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 8177; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 8178; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm26 8179; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 8180; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 8181; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 8182; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 8183; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} 8184; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 8185; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 8186; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 8187; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 8188; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} 8189; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} 8190; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 8191; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 8192; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm25 8193; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 8194; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 8195; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 8196; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 8197; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 8198; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 8199; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm24 8200; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 8201; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] 8202; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm30 8203; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 8204; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 8205; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} 8206; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 8207; AVX512DQ-BW-NEXT: vpermt2d 
%zmm5, %zmm27, %zmm26 8208; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 8209; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 8210; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 8211; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 8212; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26 8213; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 8214; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 8215; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 8216; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 8217; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 8218; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 8219; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm29 8220; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 8221; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] 8222; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm31 8223; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 8224; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 8225; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} 8226; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 8227; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 8228; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 8229; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 8230; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 8231; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} 8232; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 8233; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 8234; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 8235; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 8236; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 8237; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 8238; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 8239; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] 8240; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 8241; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 8242; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} 8243; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 8244; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 8245; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 8246; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 8247; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm0 8248; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} 8249; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) 8250; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi) 8251; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 64(%rdx) 8252; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rdx) 8253; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) 8254; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%rcx) 8255; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r8) 8256; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r8) 8257; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 64(%r9) 8258; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%r9) 8259; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%r10) 8260; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, (%r10) 8261; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) 8262; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rax) 8263; AVX512DQ-BW-NEXT: vzeroupper 8264; AVX512DQ-BW-NEXT: retq 8265; 8266; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf32: 8267; AVX512DQ-BW-FCP: # %bb.0: 8268; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8269; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 8270; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm1 8271; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), 
%zmm0 8272; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 8273; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 8274; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5 8275; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6 8276; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 8277; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 8278; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15 8279; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 8280; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 8281; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 8282; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm14 8283; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 8284; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 8285; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 8286; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 8287; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 8288; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] 8289; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 8290; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 8291; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 8292; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 8293; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} 8294; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 8295; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8296; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 8297; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 8298; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 8299; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 8300; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 8301; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil 8302; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 8303; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} 8304; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 8305; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 8306; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 8307; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 8308; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} 8309; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} 8310; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 8311; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8312; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm19 8313; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 8314; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] 8315; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 8316; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 8317; AVX512DQ-BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 8318; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 8319; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} 8320; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 8321; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 8322; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 8323; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 8324; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 8325; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 8326; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 8327; AVX512DQ-BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 8328; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 8329; 
AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} 8330; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 8331; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 8332; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm20 8333; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 8334; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 8335; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} 8336; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} 8337; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 8338; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8339; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 8340; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 8341; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] 8342; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 8343; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 8344; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} 8345; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 8346; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] 8347; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm23 8348; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 8349; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 8350; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] 8351; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 8352; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} 8353; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 8354; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 8355; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 8356; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 8357; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 8358; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} 8359; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} 8360; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 8361; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 8362; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 8363; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 8364; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] 8365; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 8366; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 8367; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} 8368; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 8369; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 8370; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 8371; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 8372; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 8373; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 8374; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 8375; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} 8376; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 8377; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 8378; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 8379; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 8380; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} 8381; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} 8382; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 8383; AVX512DQ-BW-FCP-NEXT: # zmm26 = 
mem[0,1,2,3,0,1,2,3] 8384; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 8385; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 8386; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 8387; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 8388; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 8389; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 8390; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 8391; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 8392; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 8393; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] 8394; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 8395; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 8396; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 8397; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} 8398; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 8399; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 8400; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 8401; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 8402; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 8403; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 8404; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 8405; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 8406; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 8407; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] 8408; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 8409; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 8410; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 8411; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 8412; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 8413; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] 8414; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 8415; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 8416; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 8417; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} 8418; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 8419; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 8420; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 8421; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 8422; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 8423; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} 8424; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 8425; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 8426; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 8427; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 8428; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 8429; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 8430; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 8431; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] 8432; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 8433; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 8434; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} 8435; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 8436; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 8437; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 8438; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, 
%zmm14, %zmm0 8439; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm0 8440; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} 8441; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) 8442; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) 8443; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) 8444; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) 8445; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) 8446; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) 8447; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) 8448; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8) 8449; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) 8450; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9) 8451; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r10) 8452; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, (%r10) 8453; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) 8454; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) 8455; AVX512DQ-BW-FCP-NEXT: vzeroupper 8456; AVX512DQ-BW-FCP-NEXT: retq 8457 %wide.vec = load <224 x i32>, ptr %in.vec, align 64 8458 %strided.vec0 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217> 8459 %strided.vec1 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218> 8460 %strided.vec2 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219> 8461 %strided.vec3 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220> 8462 %strided.vec4 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221> 8463 %strided.vec5 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222> 8464 %strided.vec6 = shufflevector <224 x i32> %wide.vec, <224 x i32> poison, <32 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223> 8465 
store <32 x i32> %strided.vec0, ptr %out.vec0, align 64 8466 store <32 x i32> %strided.vec1, ptr %out.vec1, align 64 8467 store <32 x i32> %strided.vec2, ptr %out.vec2, align 64 8468 store <32 x i32> %strided.vec3, ptr %out.vec3, align 64 8469 store <32 x i32> %strided.vec4, ptr %out.vec4, align 64 8470 store <32 x i32> %strided.vec5, ptr %out.vec5, align 64 8471 store <32 x i32> %strided.vec6, ptr %out.vec6, align 64 8472 ret void 8473} 8474 8475define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { 8476; SSE-LABEL: load_i32_stride7_vf64: 8477; SSE: # %bb.0: 8478; SSE-NEXT: subq $2456, %rsp # imm = 0x998 8479; SSE-NEXT: movdqa 1088(%rdi), %xmm3 8480; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8481; SSE-NEXT: movdqa 1056(%rdi), %xmm4 8482; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8483; SSE-NEXT: movdqa 1008(%rdi), %xmm9 8484; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8485; SSE-NEXT: movdqa 1024(%rdi), %xmm5 8486; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8487; SSE-NEXT: movdqa 640(%rdi), %xmm13 8488; SSE-NEXT: movdqa 608(%rdi), %xmm6 8489; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8490; SSE-NEXT: movdqa 560(%rdi), %xmm10 8491; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8492; SSE-NEXT: movdqa 576(%rdi), %xmm7 8493; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8494; SSE-NEXT: movdqa 192(%rdi), %xmm2 8495; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8496; SSE-NEXT: movdqa 160(%rdi), %xmm15 8497; SSE-NEXT: movdqa 112(%rdi), %xmm1 8498; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8499; SSE-NEXT: movdqa 128(%rdi), %xmm0 8500; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8501; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 8502; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8503; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] 8504; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8505; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 8506; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 8507; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8508; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] 8509; SSE-NEXT: movdqa %xmm10, %xmm1 8510; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8511; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] 8512; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] 8513; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8514; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 8515; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8516; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] 8517; SSE-NEXT: movdqa %xmm9, %xmm1 8518; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8519; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] 8520; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 8521; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 8522; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8523; SSE-NEXT: movdqa 1456(%rdi), %xmm1 8524; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8525; SSE-NEXT: movdqa 1472(%rdi), %xmm0 8526; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8527; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 
8528; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8529; SSE-NEXT: movdqa 1536(%rdi), %xmm2 8530; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8531; SSE-NEXT: movdqa 1504(%rdi), %xmm0 8532; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8533; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 8534; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 8535; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 8536; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8537; SSE-NEXT: movdqa (%rdi), %xmm1 8538; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8539; SSE-NEXT: movdqa 16(%rdi), %xmm0 8540; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8541; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 8542; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8543; SSE-NEXT: movdqa 80(%rdi), %xmm2 8544; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8545; SSE-NEXT: movdqa 48(%rdi), %xmm0 8546; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8547; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 8548; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 8549; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 8550; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8551; SSE-NEXT: movdqa 448(%rdi), %xmm1 8552; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8553; SSE-NEXT: movdqa 464(%rdi), %xmm0 8554; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8555; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 8556; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8557; SSE-NEXT: movdqa 528(%rdi), %xmm2 8558; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8559; SSE-NEXT: movdqa 496(%rdi), %xmm0 8560; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8561; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 8562; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 8563; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 8564; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8565; SSE-NEXT: movdqa 896(%rdi), %xmm1 8566; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8567; SSE-NEXT: movdqa 912(%rdi), %xmm0 8568; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8569; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 8570; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8571; SSE-NEXT: movdqa 976(%rdi), %xmm2 8572; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8573; SSE-NEXT: movdqa 944(%rdi), %xmm0 8574; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8575; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 8576; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 8577; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 8578; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8579; SSE-NEXT: movdqa 1344(%rdi), %xmm1 8580; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8581; SSE-NEXT: movdqa 1360(%rdi), %xmm0 8582; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8583; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 8584; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8585; SSE-NEXT: movdqa 1424(%rdi), %xmm2 8586; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8587; SSE-NEXT: movdqa 1392(%rdi), %xmm0 8588; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8589; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 8590; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 8591; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 8592; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8593; SSE-NEXT: movdqa 336(%rdi), %xmm12 8594; SSE-NEXT: movdqa 352(%rdi), %xmm0 8595; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8596; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 8597; SSE-NEXT: movdqa %xmm12, %xmm5 8598; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8599; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 8600; SSE-NEXT: movdqa 416(%rdi), %xmm4 8601; SSE-NEXT: movdqa 384(%rdi), %xmm10 8602; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] 8603; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8604; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 8605; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8606; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] 8607; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8608; SSE-NEXT: movdqa 784(%rdi), %xmm6 8609; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill 8610; SSE-NEXT: movdqa 800(%rdi), %xmm0 8611; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8612; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] 8613; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 8614; SSE-NEXT: movdqa 864(%rdi), %xmm0 8615; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8616; SSE-NEXT: movdqa 832(%rdi), %xmm1 8617; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8618; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] 8619; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 8620; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] 8621; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8622; SSE-NEXT: movdqa 1232(%rdi), %xmm6 8623; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8624; SSE-NEXT: movdqa 1248(%rdi), %xmm0 8625; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8626; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] 8627; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 8628; SSE-NEXT: movdqa 1312(%rdi), %xmm0 8629; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8630; SSE-NEXT: movdqa 1280(%rdi), %xmm1 8631; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8632; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] 8633; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 8634; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] 8635; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8636; SSE-NEXT: movdqa 1680(%rdi), %xmm6 8637; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8638; SSE-NEXT: movdqa 1696(%rdi), %xmm0 8639; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8640; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] 8641; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 8642; SSE-NEXT: movdqa 1760(%rdi), %xmm1 8643; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8644; SSE-NEXT: movdqa 1728(%rdi), %xmm0 8645; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8646; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] 8647; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] 8648; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] 8649; 
SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8650; SSE-NEXT: movdqa 224(%rdi), %xmm8 8651; SSE-NEXT: movdqa 240(%rdi), %xmm0 8652; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8653; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] 8654; SSE-NEXT: movdqa %xmm8, %xmm6 8655; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8656; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 8657; SSE-NEXT: movdqa 304(%rdi), %xmm2 8658; SSE-NEXT: movdqa 272(%rdi), %xmm3 8659; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] 8660; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8661; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] 8662; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8663; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] 8664; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8665; SSE-NEXT: movdqa 672(%rdi), %xmm11 8666; SSE-NEXT: movdqa 688(%rdi), %xmm0 8667; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8668; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] 8669; SSE-NEXT: movdqa %xmm11, %xmm7 8670; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] 8671; SSE-NEXT: movdqa 752(%rdi), %xmm14 8672; SSE-NEXT: movdqa 720(%rdi), %xmm1 8673; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] 8674; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8675; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] 8676; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 8677; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8678; SSE-NEXT: movdqa 1120(%rdi), %xmm9 8679; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8680; SSE-NEXT: movdqa 1136(%rdi), %xmm0 8681; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8682; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] 8683; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] 8684; SSE-NEXT: movdqa 1200(%rdi), %xmm5 8685; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8686; SSE-NEXT: movdqa 1168(%rdi), %xmm6 8687; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] 8688; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8689; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 8690; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] 8691; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8692; SSE-NEXT: movdqa 1568(%rdi), %xmm9 8693; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8694; SSE-NEXT: movdqa 1584(%rdi), %xmm0 8695; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8696; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] 8697; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] 8698; SSE-NEXT: movdqa 1648(%rdi), %xmm5 8699; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8700; SSE-NEXT: movdqa 1616(%rdi), %xmm0 8701; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8702; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 8703; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 8704; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] 8705; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8706; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8707; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,2,2,2] 8708; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3] 8709; SSE-NEXT: pshufd $85, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8710; SSE-NEXT: # xmm7 = mem[1,1,1,1] 8711; SSE-NEXT: movdqa 144(%rdi), %xmm0 8712; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8713; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] 8714; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm7[0],xmm15[1] 8715; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8716; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 8717; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,2,2] 8718; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 8719; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] 8720; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8721; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 8722; SSE-NEXT: movdqa 32(%rdi), %xmm5 8723; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8724; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] 8725; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] 8726; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8727; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] 8728; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] 8729; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,1,1] 8730; SSE-NEXT: movdqa 368(%rdi), %xmm4 8731; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8732; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] 8733; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] 8734; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8735; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] 8736; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] 8737; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,1,1] 8738; SSE-NEXT: movdqa 256(%rdi), %xmm2 8739; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8740; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] 8741; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] 8742; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8743; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,2,2,2] 8744; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8745; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] 8746; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8747; SSE-NEXT: # xmm7 = mem[1,1,1,1] 8748; SSE-NEXT: movdqa 592(%rdi), %xmm2 8749; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8750; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] 8751; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] 8752; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8753; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8754; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] 8755; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8756; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] 8757; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8758; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,1,1] 8759; SSE-NEXT: movdqa 480(%rdi), %xmm5 8760; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8761; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] 8762; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] 8763; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8764; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8765; SSE-NEXT: # xmm7 = 
mem[2,2,2,2] 8766; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8767; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] 8768; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload 8769; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] 8770; SSE-NEXT: movdqa 816(%rdi), %xmm5 8771; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8772; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] 8773; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] 8774; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8775; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8776; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] 8777; SSE-NEXT: movdqa %xmm1, %xmm2 8778; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] 8779; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,1,1] 8780; SSE-NEXT: movdqa %xmm11, %xmm12 8781; SSE-NEXT: movdqa 704(%rdi), %xmm1 8782; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8783; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] 8784; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] 8785; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8786; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8787; SSE-NEXT: # xmm7 = mem[2,2,2,2] 8788; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8789; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] 8790; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8791; SSE-NEXT: # xmm7 = mem[1,1,1,1] 8792; SSE-NEXT: movdqa 1040(%rdi), %xmm2 8793; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8794; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] 8795; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] 8796; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8797; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 8798; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] 8799; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8800; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] 8801; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8802; SSE-NEXT: # xmm7 = mem[1,1,1,1] 8803; SSE-NEXT: movdqa 928(%rdi), %xmm2 8804; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8805; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] 8806; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] 8807; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8808; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8809; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,2,2,2] 8810; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8811; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] 8812; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8813; SSE-NEXT: # xmm7 = mem[1,1,1,1] 8814; SSE-NEXT: movdqa 1264(%rdi), %xmm1 8815; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8816; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] 8817; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] 8818; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8819; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 8820; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,2,2] 8821; SSE-NEXT: movdqa %xmm6, %xmm2 8822; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = 
xmm2[2],xmm7[2],xmm2[3],xmm7[3] 8823; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8824; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] 8825; SSE-NEXT: movdqa 1152(%rdi), %xmm6 8826; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8827; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] 8828; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] 8829; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8830; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8831; SSE-NEXT: # xmm7 = mem[2,2,2,2] 8832; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8833; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] 8834; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8835; SSE-NEXT: # xmm7 = mem[1,1,1,1] 8836; SSE-NEXT: movdqa 1488(%rdi), %xmm6 8837; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8838; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] 8839; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] 8840; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8841; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 8842; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,2,2,2] 8843; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8844; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] 8845; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8846; SSE-NEXT: # xmm7 = mem[1,1,1,1] 8847; SSE-NEXT: movdqa 1376(%rdi), %xmm6 8848; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8849; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] 8850; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] 8851; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8852; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 8853; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[2,2,2,2] 8854; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8855; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] 8856; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8857; SSE-NEXT: # xmm7 = mem[1,1,1,1] 8858; SSE-NEXT: movdqa 1712(%rdi), %xmm6 8859; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8860; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] 8861; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] 8862; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8863; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8864; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,2,2,2] 8865; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8866; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] 8867; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8868; SSE-NEXT: # xmm7 = mem[1,1,1,1] 8869; SSE-NEXT: movdqa 1600(%rdi), %xmm9 8870; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8871; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 8872; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] 8873; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8874; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] 8875; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8876; SSE-NEXT: # xmm9 = mem[1,1,1,1] 8877; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 8878; SSE-NEXT: 
movdqa 64(%rdi), %xmm0 8879; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8880; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] 8881; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] 8882; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] 8883; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] 8884; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8885; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8886; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] 8887; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8888; SSE-NEXT: # xmm9 = mem[1,1,1,1] 8889; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 8890; SSE-NEXT: movdqa 176(%rdi), %xmm0 8891; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8892; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8893; SSE-NEXT: # xmm9 = mem[2,3,2,3] 8894; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 8895; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 8896; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 8897; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8898; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8899; SSE-NEXT: # xmm7 = mem[2,3,2,3] 8900; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8901; SSE-NEXT: # xmm9 = mem[1,1,1,1] 8902; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 8903; SSE-NEXT: movdqa 288(%rdi), %xmm0 8904; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8905; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8906; SSE-NEXT: # xmm9 = mem[2,3,2,3] 8907; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 8908; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 8909; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 8910; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8911; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8912; SSE-NEXT: # xmm7 = mem[2,3,2,3] 8913; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8914; SSE-NEXT: # xmm9 = mem[1,1,1,1] 8915; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 8916; SSE-NEXT: movdqa 400(%rdi), %xmm0 8917; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8918; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8919; SSE-NEXT: # xmm9 = mem[2,3,2,3] 8920; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 8921; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 8922; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 8923; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8924; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] 8925; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8926; SSE-NEXT: # xmm9 = mem[1,1,1,1] 8927; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 8928; SSE-NEXT: movdqa 512(%rdi), %xmm0 8929; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8930; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,3,2,3] 8931; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 8932; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 8933; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 8934; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8935; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte 
Folded Reload 8936; SSE-NEXT: # xmm7 = mem[2,3,2,3] 8937; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8938; SSE-NEXT: # xmm9 = mem[1,1,1,1] 8939; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 8940; SSE-NEXT: movdqa 624(%rdi), %xmm0 8941; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8942; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8943; SSE-NEXT: # xmm9 = mem[2,3,2,3] 8944; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 8945; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 8946; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 8947; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8948; SSE-NEXT: movdqa %xmm12, %xmm15 8949; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,3,2,3] 8950; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8951; SSE-NEXT: # xmm9 = mem[1,1,1,1] 8952; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 8953; SSE-NEXT: movdqa 736(%rdi), %xmm0 8954; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8955; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] 8956; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 8957; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 8958; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 8959; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8960; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] 8961; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 8962; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,1,1] 8963; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 8964; SSE-NEXT: movdqa 848(%rdi), %xmm0 8965; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8966; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8967; SSE-NEXT: # xmm9 = mem[2,3,2,3] 8968; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 8969; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 8970; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 8971; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8972; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8973; SSE-NEXT: # xmm7 = mem[2,3,2,3] 8974; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8975; SSE-NEXT: # xmm9 = mem[1,1,1,1] 8976; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 8977; SSE-NEXT: movdqa 960(%rdi), %xmm0 8978; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8979; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] 8980; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 8981; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 8982; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 8983; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8984; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8985; SSE-NEXT: # xmm7 = mem[2,3,2,3] 8986; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 8987; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] 8988; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 8989; SSE-NEXT: movdqa 1072(%rdi), %xmm0 8990; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8991; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8992; SSE-NEXT: # xmm9 = mem[2,3,2,3] 8993; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 8994; SSE-NEXT: punpckldq 
{{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 8995; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 8996; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8997; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] 8998; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8999; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1] 9000; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 9001; SSE-NEXT: movdqa 1184(%rdi), %xmm0 9002; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9003; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] 9004; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9005; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 9006; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 9007; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9008; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9009; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] 9010; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 9011; SSE-NEXT: # xmm9 = mem[1,1,1,1] 9012; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 9013; SSE-NEXT: movdqa 1296(%rdi), %xmm0 9014; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9015; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 9016; SSE-NEXT: # xmm9 = mem[2,3,2,3] 9017; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9018; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 9019; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 9020; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9021; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 9022; SSE-NEXT: # xmm7 = mem[2,3,2,3] 9023; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 9024; SSE-NEXT: # xmm9 = mem[1,1,1,1] 9025; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 9026; SSE-NEXT: movdqa 1408(%rdi), %xmm0 9027; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9028; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] 9029; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9030; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 9031; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 9032; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9033; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 9034; SSE-NEXT: # xmm7 = mem[2,3,2,3] 9035; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 9036; SSE-NEXT: # xmm9 = mem[1,1,1,1] 9037; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 9038; SSE-NEXT: movdqa 1520(%rdi), %xmm0 9039; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9040; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 9041; SSE-NEXT: # xmm9 = mem[2,3,2,3] 9042; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9043; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 9044; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 9045; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9046; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 9047; SSE-NEXT: # xmm7 = mem[2,3,2,3] 9048; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 9049; SSE-NEXT: # xmm9 = mem[1,1,1,1] 9050; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 9051; SSE-NEXT: movdqa 1632(%rdi), %xmm0 
9052; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9053; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,3,2,3] 9054; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9055; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 9056; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 9057; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9058; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 9059; SSE-NEXT: # xmm7 = mem[2,3,2,3] 9060; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 9061; SSE-NEXT: # xmm9 = mem[1,1,1,1] 9062; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 9063; SSE-NEXT: movdqa 1744(%rdi), %xmm0 9064; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9065; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] 9066; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9067; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 9068; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] 9069; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9070; SSE-NEXT: movdqa 96(%rdi), %xmm0 9071; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9072; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] 9073; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9074; SSE-NEXT: movdqa %xmm5, %xmm0 9075; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] 9076; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 9077; SSE-NEXT: # xmm4 = mem[2,2,3,3] 9078; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9079; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] 9080; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 9081; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9082; SSE-NEXT: movdqa 208(%rdi), %xmm0 9083; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9084; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] 9085; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9086; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 9087; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] 9088; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9089; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 9090; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 9091; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9092; SSE-NEXT: movdqa 320(%rdi), %xmm0 9093; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9094; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] 9095; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 9096; SSE-NEXT: movdqa %xmm13, %xmm0 9097; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 9098; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9099; SSE-NEXT: # xmm2 = mem[2,2,3,3] 9100; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9101; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 9102; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 9103; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9104; SSE-NEXT: movdqa 432(%rdi), %xmm0 9105; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9106; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] 9107; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 9108; SSE-NEXT: movdqa %xmm11, %xmm0 9109; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] 9110; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9111; SSE-NEXT: # xmm1 = mem[2,2,3,3] 9112; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 9113; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] 9114; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9115; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9116; SSE-NEXT: movdqa 544(%rdi), %xmm0 9117; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9118; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 9119; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9120; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9121; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9122; SSE-NEXT: # xmm1 = mem[2,2,3,3] 9123; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9124; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] 9125; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9126; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9127; SSE-NEXT: movdqa 656(%rdi), %xmm0 9128; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9129; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 9130; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9131; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9132; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9133; SSE-NEXT: # xmm1 = mem[2,2,3,3] 9134; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9135; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] 9136; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9137; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9138; SSE-NEXT: movdqa 768(%rdi), %xmm0 9139; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9140; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 9141; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9142; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9143; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] 9144; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9145; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] 9146; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9147; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9148; SSE-NEXT: movdqa 880(%rdi), %xmm0 9149; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9150; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 9151; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 9152; SSE-NEXT: movdqa %xmm15, %xmm2 9153; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 9154; SSE-NEXT: pshufd $250, (%rsp), %xmm0 # 16-byte Folded Reload 9155; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9156; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] 9157; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 9158; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9159; SSE-NEXT: movdqa 992(%rdi), %xmm0 9160; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9161; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9162; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9163; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9164; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9165; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9166; SSE-NEXT: 
punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9167; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 9168; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9169; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9170; SSE-NEXT: movdqa 1104(%rdi), %xmm0 9171; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill 9172; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9173; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 9174; SSE-NEXT: movdqa %xmm12, %xmm1 9175; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9176; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9177; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9178; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] 9179; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9180; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9181; SSE-NEXT: movdqa 1216(%rdi), %xmm0 9182; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9183; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9184; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9185; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9186; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9187; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9188; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] 9189; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9190; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9191; SSE-NEXT: movdqa 1328(%rdi), %xmm0 9192; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9193; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9194; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 9195; SSE-NEXT: movdqa %xmm14, %xmm1 9196; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9197; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] 9198; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9199; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 9200; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9201; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9202; SSE-NEXT: movdqa 1440(%rdi), %xmm0 9203; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9204; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9205; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 9206; SSE-NEXT: movdqa %xmm9, %xmm1 9207; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9208; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9209; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9210; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9211; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 9212; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9213; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9214; SSE-NEXT: movdqa 1552(%rdi), %xmm0 9215; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9216; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9217; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 9218; SSE-NEXT: movdqa %xmm6, %xmm1 9219; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9220; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9221; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9222; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9223; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 9224; SSE-NEXT: 
movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9225; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9226; SSE-NEXT: movdqa 1664(%rdi), %xmm0 9227; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9228; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9229; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9230; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9231; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9232; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9233; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9234; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 9235; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9236; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9237; SSE-NEXT: movdqa 1776(%rdi), %xmm0 9238; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9239; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 9240; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 9241; SSE-NEXT: movdqa %xmm4, %xmm1 9242; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9243; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9244; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9245; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9246; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 9247; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9248; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9249; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] 9250; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9251; SSE-NEXT: movdqa %xmm7, %xmm1 9252; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9253; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] 9254; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9255; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9256; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9257; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9258; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9259; SSE-NEXT: # xmm0 = mem[3,3,3,3] 9260; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9261; SSE-NEXT: movdqa %xmm5, %xmm1 9262; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9263; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9264; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9265; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9266; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9267; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9268; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9269; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9270; SSE-NEXT: # xmm0 = mem[3,3,3,3] 9271; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9272; SSE-NEXT: movdqa %xmm8, %xmm1 9273; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9274; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] 9275; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9276; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9277; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9278; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9279; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] 9280; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 9281; SSE-NEXT: 
movdqa %xmm10, %xmm1 9282; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9283; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] 9284; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 9285; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 9286; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9287; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9288; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9289; SSE-NEXT: # xmm0 = mem[3,3,3,3] 9290; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 9291; SSE-NEXT: movdqa %xmm11, %xmm1 9292; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9293; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9294; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9295; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9296; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9297; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9298; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9299; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9300; SSE-NEXT: # xmm0 = mem[3,3,3,3] 9301; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 9302; SSE-NEXT: movdqa %xmm13, %xmm1 9303; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9304; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9305; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9306; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9307; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9308; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9309; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9310; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9311; SSE-NEXT: # xmm0 = mem[3,3,3,3] 9312; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9313; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9314; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9315; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9316; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9317; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9318; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9319; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9320; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9321; SSE-NEXT: # xmm0 = mem[3,3,3,3] 9322; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9323; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9324; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] 9325; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9326; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9327; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9328; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9329; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9330; SSE-NEXT: # xmm0 = mem[3,3,3,3] 9331; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9332; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9333; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 9334; SSE-NEXT: # xmm15 = mem[2,2,3,3] 9335; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 9336; SSE-NEXT: # xmm15 = 
xmm15[0],mem[0],xmm15[1],mem[1] 9337; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] 9338; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9339; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9340; SSE-NEXT: # xmm0 = mem[3,3,3,3] 9341; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9342; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9343; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] 9344; SSE-NEXT: movdqa (%rsp), %xmm15 # 16-byte Reload 9345; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] 9346; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9347; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9348; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9349; SSE-NEXT: # xmm0 = mem[3,3,3,3] 9350; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9351; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9352; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9353; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9354; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 9355; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] 9356; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9357; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9358; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9359; SSE-NEXT: # xmm0 = mem[3,3,3,3] 9360; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9361; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9362; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] 9363; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9364; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9365; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9366; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9367; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9368; SSE-NEXT: # xmm0 = mem[3,3,3,3] 9369; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9370; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9371; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] 9372; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 9373; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] 9374; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9375; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9376; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9377; SSE-NEXT: # xmm0 = mem[3,3,3,3] 9378; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9379; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9380; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] 9381; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 9382; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] 9383; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9384; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9385; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] 9386; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9387; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9388; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9389; SSE-NEXT: # xmm0 = mem[2,2,3,3] 9390; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Folded Reload 9391; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9392; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9393; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9394; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9395; SSE-NEXT: # xmm2 = mem[3,3,3,3] 9396; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9397; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 9398; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] 9399; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9400; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9401; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 9402; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9403; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9404; SSE-NEXT: # xmm0 = mem[2,2,2,2] 9405; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9406; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9407; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] 9408; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9409; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9410; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9411; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9412; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9413; SSE-NEXT: # xmm0 = mem[2,2,2,2] 9414; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9415; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9416; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] 9417; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9418; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9419; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9420; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9421; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9422; SSE-NEXT: # xmm0 = mem[2,2,2,2] 9423; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9424; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9425; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 9426; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9427; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9428; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9429; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9430; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] 9431; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9432; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9433; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] 9434; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9435; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9436; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9437; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9438; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 9439; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] 9440; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9441; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9442; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] 9443; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 9444; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 9445; SSE-NEXT: 
movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9446; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9447; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 9448; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] 9449; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9450; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9451; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] 9452; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9453; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] 9454; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9455; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9456; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9457; SSE-NEXT: # xmm0 = mem[2,2,2,2] 9458; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9459; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9460; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9461; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9462; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9463; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9464; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9465; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9466; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9467; SSE-NEXT: # xmm0 = mem[2,2,2,2] 9468; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9469; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9470; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9471; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9472; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9473; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] 9474; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9475; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9476; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9477; SSE-NEXT: # xmm0 = mem[2,2,2,2] 9478; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9479; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9480; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9481; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9482; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9483; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9484; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9485; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9486; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] 9487; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9488; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9489; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9490; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9491; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9492; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 9493; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9494; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9495; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] 9496; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9497; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9498; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 
9499; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9500; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9501; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9502; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9503; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9504; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9505; SSE-NEXT: # xmm0 = mem[2,2,2,2] 9506; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9507; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9508; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9509; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9510; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9511; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9512; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9513; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9514; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] 9515; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9516; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9517; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9518; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9519; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9520; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9521; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9522; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9523; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] 9524; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9525; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9526; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9527; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9528; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 9529; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 9530; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 9531; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9532; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9533; SSE-NEXT: # xmm0 = mem[2,2,2,2] 9534; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9535; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] 9536; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9537; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9538; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9539; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 9540; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 9541; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9542; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9543; SSE-NEXT: # xmm0 = mem[2,2,2,2] 9544; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] 9545; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9546; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9547; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9548; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 9549; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] 9550; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9551; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9552; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9553; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 
# 16-byte Folded Reload 9554; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9555; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9556; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9557; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9558; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 9559; SSE-NEXT: # xmm4 = mem[0,0,1,1] 9560; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] 9561; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] 9562; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9563; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9564; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9565; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9566; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9567; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9568; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9569; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9570; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 9571; SSE-NEXT: # xmm4 = mem[0,0,1,1] 9572; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] 9573; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] 9574; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9575; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9576; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9577; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9578; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9579; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9580; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9581; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9582; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 9583; SSE-NEXT: # xmm15 = mem[0,0,1,1] 9584; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] 9585; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] 9586; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9587; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9588; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9589; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9590; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9591; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9592; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9593; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 9594; SSE-NEXT: # xmm14 = mem[0,0,1,1] 9595; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] 9596; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] 9597; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] 9598; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9599; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9600; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9601; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] 9602; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 9603; SSE-NEXT: # xmm13 = mem[0,0,1,1] 9604; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] 9605; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] 9606; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] 9607; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9608; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9609; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 
9610; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] 9611; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 9612; SSE-NEXT: # xmm12 = mem[0,0,1,1] 9613; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] 9614; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] 9615; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9616; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9617; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9618; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9619; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9620; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9621; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9622; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 9623; SSE-NEXT: # xmm11 = mem[0,0,1,1] 9624; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] 9625; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] 9626; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 9627; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9628; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9629; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9630; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9631; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9632; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 9633; SSE-NEXT: # xmm10 = mem[0,0,1,1] 9634; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] 9635; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] 9636; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9637; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9638; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9639; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9640; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9641; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9642; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9643; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 9644; SSE-NEXT: # xmm9 = mem[0,0,1,1] 9645; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] 9646; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] 9647; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] 9648; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9649; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9650; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9651; SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload 9652; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9653; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 9654; SSE-NEXT: # xmm8 = mem[0,0,1,1] 9655; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] 9656; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] 9657; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9658; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9659; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9660; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9661; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9662; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9663; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9664; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 9665; SSE-NEXT: # xmm7 = mem[0,0,1,1] 9666; SSE-NEXT: punpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm0[0],xmm7[1],xmm0[1] 9667; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] 9668; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9669; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9670; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9671; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9672; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9673; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9674; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9675; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 9676; SSE-NEXT: # xmm6 = mem[0,0,1,1] 9677; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] 9678; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] 9679; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9680; SSE-NEXT: # xmm0 = mem[1,1,1,1] 9681; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9682; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9683; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9684; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9685; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9686; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 9687; SSE-NEXT: # xmm5 = mem[0,0,1,1] 9688; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 9689; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] 9690; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] 9691; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9692; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9693; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9694; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9695; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9696; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 9697; SSE-NEXT: # xmm4 = mem[0,0,1,1] 9698; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] 9699; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] 9700; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9701; SSE-NEXT: # xmm1 = mem[1,1,1,1] 9702; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9703; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9704; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 9705; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9706; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9707; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 9708; SSE-NEXT: # xmm3 = mem[0,0,1,1] 9709; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 9710; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] 9711; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] 9712; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9713; SSE-NEXT: # xmm1 = mem[2,3,2,3] 9714; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9715; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9716; SSE-NEXT: # xmm0 = mem[2,3,2,3] 9717; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9718; SSE-NEXT: # xmm2 = mem[0,0,1,1] 9719; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 9720; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 9721; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9722; SSE-NEXT: movaps %xmm0, 224(%rsi) 9723; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9724; SSE-NEXT: movaps %xmm0, 160(%rsi) 9725; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9726; SSE-NEXT: movaps %xmm0, 96(%rsi) 9727; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9728; SSE-NEXT: movaps %xmm0, 32(%rsi) 9729; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9730; SSE-NEXT: movaps %xmm0, 240(%rsi) 9731; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9732; SSE-NEXT: movaps %xmm0, 176(%rsi) 9733; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9734; SSE-NEXT: movaps %xmm0, 112(%rsi) 9735; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9736; SSE-NEXT: movaps %xmm0, 48(%rsi) 9737; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9738; SSE-NEXT: movaps %xmm0, 192(%rsi) 9739; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9740; SSE-NEXT: movaps %xmm0, 128(%rsi) 9741; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9742; SSE-NEXT: movaps %xmm0, 64(%rsi) 9743; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9744; SSE-NEXT: movaps %xmm0, (%rsi) 9745; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9746; SSE-NEXT: movaps %xmm0, 208(%rsi) 9747; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9748; SSE-NEXT: movaps %xmm0, 144(%rsi) 9749; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9750; SSE-NEXT: movaps %xmm0, 80(%rsi) 9751; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9752; SSE-NEXT: movaps %xmm0, 16(%rsi) 9753; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9754; SSE-NEXT: movaps %xmm0, 224(%rdx) 9755; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9756; SSE-NEXT: movaps %xmm0, 240(%rdx) 9757; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9758; SSE-NEXT: movaps %xmm0, 192(%rdx) 9759; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9760; SSE-NEXT: movaps %xmm0, 208(%rdx) 9761; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9762; SSE-NEXT: movaps %xmm0, 160(%rdx) 9763; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9764; SSE-NEXT: movaps %xmm0, 176(%rdx) 9765; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9766; SSE-NEXT: movaps %xmm0, 128(%rdx) 9767; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9768; SSE-NEXT: movaps %xmm0, 144(%rdx) 9769; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9770; SSE-NEXT: movaps %xmm0, 96(%rdx) 9771; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9772; SSE-NEXT: movaps %xmm0, 112(%rdx) 9773; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9774; SSE-NEXT: movaps %xmm0, 64(%rdx) 9775; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9776; SSE-NEXT: movaps %xmm0, 80(%rdx) 9777; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9778; SSE-NEXT: movaps %xmm0, 32(%rdx) 9779; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9780; SSE-NEXT: movaps %xmm0, 48(%rdx) 9781; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9782; SSE-NEXT: movaps %xmm0, (%rdx) 9783; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9784; SSE-NEXT: movaps %xmm0, 16(%rdx) 9785; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9786; SSE-NEXT: movaps %xmm0, 240(%rcx) 9787; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9788; SSE-NEXT: movaps %xmm0, 224(%rcx) 9789; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9790; SSE-NEXT: movaps %xmm0, 208(%rcx) 9791; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9792; SSE-NEXT: movaps %xmm0, 192(%rcx) 9793; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9794; SSE-NEXT: movaps %xmm0, 176(%rcx) 9795; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9796; SSE-NEXT: movaps %xmm0, 160(%rcx) 9797; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9798; SSE-NEXT: movaps %xmm0, 144(%rcx) 9799; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9800; SSE-NEXT: movaps %xmm0, 128(%rcx) 9801; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9802; SSE-NEXT: movaps %xmm0, 112(%rcx) 9803; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9804; SSE-NEXT: movaps %xmm0, 96(%rcx) 9805; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9806; SSE-NEXT: movaps %xmm0, 80(%rcx) 9807; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9808; SSE-NEXT: movaps %xmm0, 64(%rcx) 9809; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9810; SSE-NEXT: movaps %xmm0, 48(%rcx) 9811; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9812; SSE-NEXT: movaps %xmm0, 32(%rcx) 9813; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9814; SSE-NEXT: movaps %xmm0, 16(%rcx) 9815; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9816; SSE-NEXT: movaps %xmm0, (%rcx) 9817; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9818; SSE-NEXT: movaps %xmm0, 240(%r8) 9819; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9820; SSE-NEXT: movaps %xmm0, 224(%r8) 9821; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9822; SSE-NEXT: movaps %xmm0, 208(%r8) 9823; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9824; SSE-NEXT: movaps %xmm0, 192(%r8) 9825; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9826; SSE-NEXT: movaps %xmm0, 176(%r8) 9827; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9828; SSE-NEXT: movaps %xmm0, 160(%r8) 9829; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9830; SSE-NEXT: movaps %xmm0, 144(%r8) 9831; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9832; SSE-NEXT: movaps %xmm0, 128(%r8) 9833; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9834; SSE-NEXT: movaps %xmm0, 112(%r8) 9835; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9836; SSE-NEXT: movaps %xmm0, 96(%r8) 9837; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9838; SSE-NEXT: movaps %xmm0, 80(%r8) 9839; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9840; SSE-NEXT: movaps %xmm0, 64(%r8) 9841; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9842; SSE-NEXT: movaps %xmm0, 48(%r8) 9843; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9844; SSE-NEXT: movaps %xmm0, 32(%r8) 9845; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9846; SSE-NEXT: movaps %xmm0, 16(%r8) 9847; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9848; SSE-NEXT: movaps %xmm0, (%r8) 9849; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9850; SSE-NEXT: movaps %xmm0, 240(%r9) 9851; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9852; SSE-NEXT: movaps %xmm0, 224(%r9) 9853; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9854; SSE-NEXT: movaps %xmm0, 208(%r9) 9855; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9856; SSE-NEXT: movaps %xmm0, 192(%r9) 9857; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9858; SSE-NEXT: movaps %xmm0, 176(%r9) 9859; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9860; SSE-NEXT: movaps %xmm0, 160(%r9) 9861; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9862; SSE-NEXT: movaps %xmm0, 144(%r9) 9863; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9864; SSE-NEXT: movaps %xmm0, 128(%r9) 9865; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9866; SSE-NEXT: movaps %xmm0, 112(%r9) 9867; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9868; SSE-NEXT: movaps %xmm0, 96(%r9) 9869; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9870; SSE-NEXT: movaps %xmm0, 80(%r9) 9871; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9872; SSE-NEXT: movaps %xmm0, 64(%r9) 9873; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9874; SSE-NEXT: movaps %xmm0, 48(%r9) 9875; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9876; SSE-NEXT: movaps %xmm0, 32(%r9) 9877; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9878; SSE-NEXT: movaps %xmm0, 16(%r9) 9879; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9880; SSE-NEXT: movaps %xmm0, (%r9) 9881; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 9882; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9883; SSE-NEXT: movaps %xmm0, 240(%rax) 9884; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9885; SSE-NEXT: movaps %xmm0, 224(%rax) 9886; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9887; SSE-NEXT: movaps %xmm0, 208(%rax) 9888; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9889; SSE-NEXT: movaps %xmm0, 192(%rax) 9890; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9891; SSE-NEXT: movaps %xmm0, 176(%rax) 9892; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9893; SSE-NEXT: movaps %xmm0, 160(%rax) 9894; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9895; SSE-NEXT: movaps %xmm0, 144(%rax) 9896; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9897; SSE-NEXT: movaps %xmm0, 128(%rax) 9898; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9899; SSE-NEXT: movaps %xmm0, 112(%rax) 9900; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9901; SSE-NEXT: movaps %xmm0, 96(%rax) 9902; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9903; SSE-NEXT: movaps %xmm0, 80(%rax) 9904; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9905; SSE-NEXT: movaps %xmm0, 64(%rax) 9906; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9907; SSE-NEXT: movaps %xmm0, 48(%rax) 9908; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9909; SSE-NEXT: movaps %xmm0, 32(%rax) 9910; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9911; SSE-NEXT: movaps %xmm0, 16(%rax) 9912; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9913; SSE-NEXT: movaps %xmm0, (%rax) 9914; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 9915; SSE-NEXT: movapd %xmm2, 
240(%rax) 9916; SSE-NEXT: movapd %xmm3, 224(%rax) 9917; SSE-NEXT: movapd %xmm4, 208(%rax) 9918; SSE-NEXT: movapd %xmm5, 192(%rax) 9919; SSE-NEXT: movapd %xmm6, 176(%rax) 9920; SSE-NEXT: movapd %xmm7, 160(%rax) 9921; SSE-NEXT: movapd %xmm8, 144(%rax) 9922; SSE-NEXT: movapd %xmm9, 128(%rax) 9923; SSE-NEXT: movapd %xmm10, 112(%rax) 9924; SSE-NEXT: movapd %xmm11, 96(%rax) 9925; SSE-NEXT: movapd %xmm12, 80(%rax) 9926; SSE-NEXT: movapd %xmm13, 64(%rax) 9927; SSE-NEXT: movapd %xmm14, 48(%rax) 9928; SSE-NEXT: movapd %xmm15, 32(%rax) 9929; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9930; SSE-NEXT: movaps %xmm0, 16(%rax) 9931; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9932; SSE-NEXT: movaps %xmm0, (%rax) 9933; SSE-NEXT: addq $2456, %rsp # imm = 0x998 9934; SSE-NEXT: retq 9935; 9936; AVX-LABEL: load_i32_stride7_vf64: 9937; AVX: # %bb.0: 9938; AVX-NEXT: subq $3176, %rsp # imm = 0xC68 9939; AVX-NEXT: vmovaps 704(%rdi), %ymm2 9940; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9941; AVX-NEXT: vmovaps 672(%rdi), %ymm3 9942; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9943; AVX-NEXT: vmovaps 768(%rdi), %ymm11 9944; AVX-NEXT: vmovaps 256(%rdi), %ymm4 9945; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9946; AVX-NEXT: vmovaps 224(%rdi), %ymm1 9947; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9948; AVX-NEXT: vmovaps 320(%rdi), %ymm6 9949; AVX-NEXT: vmovaps 304(%rdi), %xmm0 9950; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9951; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] 9952; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9953; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] 9954; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 9955; AVX-NEXT: vmovaps 224(%rdi), %xmm10 9956; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] 9957; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] 9958; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 9959; AVX-NEXT: vmovaps 384(%rdi), %xmm1 9960; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9961; AVX-NEXT: vmovaps 352(%rdi), %xmm4 9962; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9963; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] 9964; AVX-NEXT: vmovaps 416(%rdi), %xmm4 9965; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9966; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm4[1] 9967; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 9968; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 9969; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9970; AVX-NEXT: vmovaps 752(%rdi), %xmm0 9971; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9972; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] 9973; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9974; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] 9975; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 9976; AVX-NEXT: vmovaps 672(%rdi), %xmm15 9977; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] 9978; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] 9979; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 9980; AVX-NEXT: vmovaps 832(%rdi), %xmm1 9981; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9982; AVX-NEXT: vmovaps 800(%rdi), %xmm2 9983; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill 9984; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 9985; AVX-NEXT: vmovaps 864(%rdi), %xmm13 9986; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[1] 9987; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 9988; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 9989; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9990; AVX-NEXT: vmovaps 1152(%rdi), %ymm1 9991; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9992; AVX-NEXT: vmovaps 1120(%rdi), %ymm0 9993; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9994; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] 9995; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 9996; AVX-NEXT: vmovaps 1120(%rdi), %xmm1 9997; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9998; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 9999; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 10000; AVX-NEXT: vmovaps 1216(%rdi), %ymm9 10001; AVX-NEXT: vmovaps 1200(%rdi), %xmm1 10002; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10003; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] 10004; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10005; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 10006; AVX-NEXT: vmovaps 1280(%rdi), %xmm1 10007; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10008; AVX-NEXT: vmovaps 1248(%rdi), %xmm2 10009; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10010; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 10011; AVX-NEXT: vmovaps 1312(%rdi), %xmm2 10012; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10013; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] 10014; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10015; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 10016; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10017; AVX-NEXT: vmovaps 1600(%rdi), %ymm1 10018; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10019; AVX-NEXT: vmovaps 1568(%rdi), %ymm0 10020; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10021; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] 10022; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 10023; AVX-NEXT: vmovaps 1568(%rdi), %xmm1 10024; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10025; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 10026; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 10027; AVX-NEXT: vmovaps 1664(%rdi), %ymm2 10028; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10029; AVX-NEXT: vmovaps 1648(%rdi), %xmm1 10030; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10031; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] 10032; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 10033; AVX-NEXT: vmovaps 1728(%rdi), %xmm1 10034; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10035; AVX-NEXT: vmovaps 1696(%rdi), %xmm2 10036; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10037; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 10038; AVX-NEXT: vmovaps 1760(%rdi), %xmm2 10039; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10040; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] 10041; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10042; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm1[5,6,7] 10043; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10044; AVX-NEXT: vmovaps 32(%rdi), %ymm1 10045; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10046; AVX-NEXT: vmovaps (%rdi), %ymm0 10047; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10048; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] 10049; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 10050; AVX-NEXT: vmovaps (%rdi), %xmm1 10051; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10052; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 10053; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 10054; AVX-NEXT: vmovaps 96(%rdi), %ymm14 10055; AVX-NEXT: vmovaps 80(%rdi), %xmm1 10056; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10057; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2] 10058; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10059; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 10060; AVX-NEXT: vmovaps 160(%rdi), %xmm1 10061; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10062; AVX-NEXT: vmovaps 128(%rdi), %xmm2 10063; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10064; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 10065; AVX-NEXT: vmovaps 192(%rdi), %xmm2 10066; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10067; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] 10068; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10069; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 10070; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10071; AVX-NEXT: vmovaps 480(%rdi), %ymm1 10072; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10073; AVX-NEXT: vmovaps 448(%rdi), %ymm0 10074; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10075; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] 10076; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 10077; AVX-NEXT: vmovaps 448(%rdi), %xmm1 10078; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10079; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 10080; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 10081; AVX-NEXT: vmovaps 544(%rdi), %ymm2 10082; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10083; AVX-NEXT: vmovaps 528(%rdi), %xmm1 10084; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10085; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] 10086; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 10087; AVX-NEXT: vmovaps 608(%rdi), %xmm1 10088; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10089; AVX-NEXT: vmovaps 576(%rdi), %xmm2 10090; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10091; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 10092; AVX-NEXT: vmovaps 640(%rdi), %xmm2 10093; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10094; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] 10095; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10096; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 10097; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10098; AVX-NEXT: vmovaps 928(%rdi), %ymm1 10099; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10100; AVX-NEXT: vmovaps 896(%rdi), %ymm0 10101; AVX-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10102; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] 10103; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 10104; AVX-NEXT: vmovaps 896(%rdi), %xmm12 10105; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] 10106; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 10107; AVX-NEXT: vmovaps 992(%rdi), %ymm5 10108; AVX-NEXT: vmovaps 976(%rdi), %xmm1 10109; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10110; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] 10111; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10112; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 10113; AVX-NEXT: vmovaps 1056(%rdi), %xmm1 10114; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10115; AVX-NEXT: vmovaps 1024(%rdi), %xmm2 10116; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10117; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 10118; AVX-NEXT: vmovaps 1088(%rdi), %xmm8 10119; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1] 10120; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10121; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10122; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 10123; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10124; AVX-NEXT: vmovaps 1376(%rdi), %ymm1 10125; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10126; AVX-NEXT: vmovaps 1344(%rdi), %ymm0 10127; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10128; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] 10129; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 10130; AVX-NEXT: vmovaps 1344(%rdi), %xmm1 10131; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10132; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 10133; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 10134; AVX-NEXT: vmovaps 1440(%rdi), %ymm4 10135; AVX-NEXT: vmovaps 1424(%rdi), %xmm1 10136; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10137; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] 10138; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10139; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 10140; AVX-NEXT: vmovaps 1504(%rdi), %xmm1 10141; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10142; AVX-NEXT: vmovaps 1472(%rdi), %xmm2 10143; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10144; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] 10145; AVX-NEXT: vmovaps 1536(%rdi), %xmm2 10146; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10147; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] 10148; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10149; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 10150; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10151; AVX-NEXT: vmovaps 288(%rdi), %ymm0 10152; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10153; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] 10154; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 10155; AVX-NEXT: vmovaps 256(%rdi), %xmm1 10156; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10157; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3] 10158; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10159; AVX-NEXT: vshufps {{.*#+}} xmm1 
= xmm1[1,0],mem[3,3] 10160; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 10161; AVX-NEXT: vmovaps 384(%rdi), %ymm1 10162; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10163; AVX-NEXT: vmovaps 352(%rdi), %ymm2 10164; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10165; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] 10166; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] 10167; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 10168; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 10169; AVX-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] 10170; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10171; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 10172; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10173; AVX-NEXT: vmovaps 736(%rdi), %ymm0 10174; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10175; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1],ymm0[2,2],ymm11[5,5],ymm0[6,6] 10176; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 10177; AVX-NEXT: vmovaps 704(%rdi), %xmm1 10178; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10179; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3] 10180; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10181; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] 10182; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 10183; AVX-NEXT: vmovaps 832(%rdi), %ymm1 10184; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10185; AVX-NEXT: vmovaps 800(%rdi), %ymm2 10186; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10187; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] 10188; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] 10189; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 10190; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[2] 10191; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10192; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 10193; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10194; AVX-NEXT: vmovaps 1184(%rdi), %ymm0 10195; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10196; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm0[2,2],ymm9[5,5],ymm0[6,6] 10197; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 10198; AVX-NEXT: vmovaps 1152(%rdi), %xmm1 10199; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10200; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 10201; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3] 10202; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] 10203; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 10204; AVX-NEXT: vmovaps 1280(%rdi), %ymm1 10205; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10206; AVX-NEXT: vmovaps 1248(%rdi), %ymm2 10207; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10208; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] 10209; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] 10210; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 10211; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10212; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[2] 10213; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10214; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 10215; AVX-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10216; AVX-NEXT: vmovaps 1632(%rdi), %ymm0 10217; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10218; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10219; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6] 10220; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 10221; AVX-NEXT: vmovaps 1600(%rdi), %xmm1 10222; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10223; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 10224; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] 10225; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] 10226; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 10227; AVX-NEXT: vmovaps 1728(%rdi), %ymm1 10228; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10229; AVX-NEXT: vmovaps 1696(%rdi), %ymm2 10230; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10231; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] 10232; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] 10233; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 10234; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 10235; AVX-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] 10236; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 10237; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 10238; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10239; AVX-NEXT: vmovaps 64(%rdi), %ymm0 10240; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10241; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1],ymm0[2,2],ymm14[5,5],ymm0[6,6] 10242; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 10243; AVX-NEXT: vmovaps 32(%rdi), %xmm1 10244; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10245; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 10246; AVX-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] 10247; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] 10248; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] 10249; AVX-NEXT: vmovaps 160(%rdi), %ymm2 10250; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10251; AVX-NEXT: vmovaps 128(%rdi), %ymm0 10252; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10253; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[0,1] 10254; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm2[3,3],ymm0[4,4],ymm2[7,7] 10255; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 10256; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 10257; AVX-NEXT: # xmm2 = zero,xmm2[1,2],mem[0] 10258; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 10259; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 10260; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10261; AVX-NEXT: vmovaps 512(%rdi), %ymm0 10262; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10263; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10264; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6] 10265; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] 10266; AVX-NEXT: vmovaps 480(%rdi), %xmm0 10267; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10268; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 10269; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm14[1],xmm0[2,3] 10270; AVX-NEXT: vshufps {{.*#+}} xmm2 = 
xmm2[1,0],mem[3,3] 10271; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] 10272; AVX-NEXT: vmovaps 608(%rdi), %ymm0 10273; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10274; AVX-NEXT: vmovaps 576(%rdi), %ymm1 10275; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10276; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[0,1] 10277; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] 10278; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 10279; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 10280; AVX-NEXT: # xmm3 = zero,xmm3[1,2],mem[0] 10281; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 10282; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] 10283; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10284; AVX-NEXT: vmovaps 960(%rdi), %ymm0 10285; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10286; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] 10287; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] 10288; AVX-NEXT: vmovaps 928(%rdi), %xmm0 10289; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10290; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm12[1],xmm0[2,3] 10291; AVX-NEXT: vmovaps %xmm12, %xmm6 10292; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10293; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] 10294; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] 10295; AVX-NEXT: vmovaps 1056(%rdi), %ymm0 10296; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10297; AVX-NEXT: vmovaps 1024(%rdi), %ymm1 10298; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10299; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[0,1] 10300; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,0],ymm5[3,3],ymm1[4,4],ymm5[7,7] 10301; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5 10302; AVX-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm8[2] 10303; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 10304; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] 10305; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10306; AVX-NEXT: vmovaps 1408(%rdi), %ymm0 10307; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10308; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6] 10309; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] 10310; AVX-NEXT: vmovaps 1376(%rdi), %xmm4 10311; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 10312; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm4[0],xmm3[1],xmm4[2,3] 10313; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,0],mem[3,3] 10314; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm5[3,4,5,6,7] 10315; AVX-NEXT: vmovaps 1504(%rdi), %ymm0 10316; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10317; AVX-NEXT: vmovaps 1472(%rdi), %ymm1 10318; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10319; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm0[0,1] 10320; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,0],ymm8[3,3],ymm1[4,4],ymm8[7,7] 10321; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 10322; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 10323; AVX-NEXT: # xmm8 = zero,xmm8[1,2],mem[0] 10324; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 10325; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5,6,7] 10326; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill 10327; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm10[2,3,2,3] 10328; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 10329; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] 10330; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 10331; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload 10332; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] 10333; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10334; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] 10335; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] 10336; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 10337; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10338; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] 10339; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 10340; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload 10341; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] 10342; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 10343; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] 10344; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10345; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm15[2,3,2,3] 10346; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 10347; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] 10348; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 10349; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload 10350; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] 10351; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10352; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] 10353; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] 10354; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10355; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10356; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 10357; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 10358; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3] 10359; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 10360; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] 10361; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10362; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm11[2,3,2,3] 10363; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 10364; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] 10365; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 10366; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload 10367; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] 10368; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10369; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] 10370; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] 10371; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10372; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10373; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 10374; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 10375; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm9[3] 10376; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 
10377; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] 10378; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10379; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm7[2,3,2,3] 10380; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 10381; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3] 10382; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 10383; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload 10384; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] 10385; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10386; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] 10387; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] 10388; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 10389; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 10390; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] 10391; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 10392; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload 10393; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] 10394; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 10395; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] 10396; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10397; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 10398; AVX-NEXT: # xmm8 = mem[2,3,2,3] 10399; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 10400; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] 10401; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 10402; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload 10403; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] 10404; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10405; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] 10406; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] 10407; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10408; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload 10409; AVX-NEXT: # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2] 10410; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 10411; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload 10412; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] 10413; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 10414; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] 10415; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10416; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm14[2,3,2,3] 10417; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 10418; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] 10419; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 10420; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload 10421; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] 10422; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10423; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] 10424; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] 10425; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10426; AVX-NEXT: vunpcklpd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload 10427; AVX-NEXT: # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2] 10428; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 10429; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload 10430; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] 10431; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 10432; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] 10433; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10434; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm6[2,3,2,3] 10435; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 10436; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] 10437; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 10438; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload 10439; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] 10440; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 10441; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm14[2,1],ymm12[2,0],ymm14[6,5],ymm12[6,4] 10442; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] 10443; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10444; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload 10445; AVX-NEXT: # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2] 10446; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 10447; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload 10448; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] 10449; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 10450; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] 10451; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10452; AVX-NEXT: vmovaps %xmm3, %xmm9 10453; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,3,2,3] 10454; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] 10455; AVX-NEXT: vmovaps %xmm4, %xmm6 10456; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10457; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 10458; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 10459; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm5[0,3],ymm12[7,5],ymm5[4,7] 10460; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 10461; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm4[2,1],ymm12[2,0],ymm4[6,5],ymm12[6,4] 10462; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] 10463; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 10464; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10465; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] 10466; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 10467; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload 10468; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] 10469; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 10470; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] 10471; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10472; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 10473; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10474; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm12[0,0],ymm0[5,4],ymm12[4,4] 10475; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,1],ymm8[0,2],ymm12[7,5],ymm8[4,6] 10476; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10477; AVX-NEXT: vblendps 
$7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload 10478; AVX-NEXT: # xmm12 = mem[0,1,2],xmm0[3] 10479; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] 10480; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7] 10481; AVX-NEXT: vmovaps 416(%rdi), %ymm0 10482; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10483; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm10[1,3],ymm0[4,5],ymm10[5,7] 10484; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10485; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4] 10486; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] 10487; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10488; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 10489; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10490; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm12[0,0],ymm0[5,4],ymm12[4,4] 10491; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,1],ymm8[0,2],ymm12[7,5],ymm8[4,6] 10492; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10493; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload 10494; AVX-NEXT: # xmm12 = mem[0,1,2],xmm0[3] 10495; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] 10496; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7] 10497; AVX-NEXT: vmovaps 864(%rdi), %ymm0 10498; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10499; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm2[1,3],ymm0[4,5],ymm2[5,7] 10500; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10501; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4] 10502; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm12[5,6,7] 10503; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10504; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10505; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10506; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm2[0,0],ymm0[5,4],ymm2[4,4] 10507; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm2[3,1],ymm8[0,2],ymm2[7,5],ymm8[4,6] 10508; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10509; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload 10510; AVX-NEXT: # xmm12 = mem[0,1,2],xmm0[3] 10511; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] 10512; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7] 10513; AVX-NEXT: vmovaps 1312(%rdi), %ymm0 10514; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10515; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm1[1,3],ymm0[4,5],ymm1[5,7] 10516; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10517; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4] 10518; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm12[5,6,7] 10519; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10520; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10521; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10522; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4] 10523; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm1[3,1],ymm8[0,2],ymm1[7,5],ymm8[4,6] 10524; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload 10525; AVX-NEXT: # xmm12 = xmm13[0,1,2],mem[3] 10526; AVX-NEXT: vshufps {{.*#+}} xmm12 = 
xmm12[3,2,2,3] 10527; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7] 10528; AVX-NEXT: vmovaps 1760(%rdi), %ymm0 10529; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10530; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm11[1,3],ymm0[4,5],ymm11[5,7] 10531; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm15[0,2],ymm12[2,0],ymm15[4,6],ymm12[6,4] 10532; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm12[5,6,7] 10533; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10534; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm4[0,0],ymm5[5,4],ymm4[4,4] 10535; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm2[0,2],ymm4[7,5],ymm2[4,6] 10536; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm9[3] 10537; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] 10538; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] 10539; AVX-NEXT: vmovaps 1536(%rdi), %ymm0 10540; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10541; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7] 10542; AVX-NEXT: vmovaps %ymm3, %ymm15 10543; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,2],ymm4[2,0],ymm7[4,6],ymm4[6,4] 10544; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7] 10545; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10546; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10547; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm14[0,0],ymm1[5,4],ymm14[4,4] 10548; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,1],ymm2[0,2],ymm14[7,5],ymm2[4,6] 10549; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 10550; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload 10551; AVX-NEXT: # xmm4 = xmm12[0,1,2],mem[3] 10552; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] 10553; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] 10554; AVX-NEXT: vmovaps 1088(%rdi), %ymm11 10555; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 10556; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,1],ymm10[1,3],ymm11[4,5],ymm10[5,7] 10557; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10558; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 10559; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm4[2,0],ymm14[4,6],ymm4[6,4] 10560; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7] 10561; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10562; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10563; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10564; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] 10565; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] 10566; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10567; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm4 # 16-byte Folded Reload 10568; AVX-NEXT: # xmm4 = xmm9[0,1,2],mem[3] 10569; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] 10570; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] 10571; AVX-NEXT: vmovaps 640(%rdi), %ymm8 10572; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 10573; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm5[1,3],ymm8[4,5],ymm5[5,7] 10574; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10575; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 10576; AVX-NEXT: vshufps {{.*#+}} ymm4 = 
ymm3[0,2],ymm4[2,0],ymm3[4,6],ymm4[6,4] 10577; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7] 10578; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10579; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10580; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10581; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] 10582; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] 10583; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 10584; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload 10585; AVX-NEXT: # xmm4 = xmm6[0,1,2],mem[3] 10586; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] 10587; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] 10588; AVX-NEXT: vmovaps 192(%rdi), %ymm13 10589; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10590; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10591; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm13[0,1],ymm1[1,3],ymm13[4,5],ymm1[5,7] 10592; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10593; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2],ymm7[2,0],ymm2[4,6],ymm7[6,4] 10594; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm7[5,6,7] 10595; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10596; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,0,1] 10597; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm4[0,0],ymm2[7,4],ymm4[4,4] 10598; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm1[2,0],ymm13[5,4],ymm1[6,4] 10599; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] 10600; AVX-NEXT: vmovaps 64(%rdi), %xmm2 10601; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10602; AVX-NEXT: vmovaps 96(%rdi), %xmm1 10603; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10604; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1] 10605; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] 10606; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] 10607; AVX-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] 10608; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] 10609; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] 10610; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10611; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10612; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] 10613; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] 10614; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10615; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 10616; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm7[2,0],ymm13[5,4],ymm7[6,4] 10617; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] 10618; AVX-NEXT: vmovaps 320(%rdi), %xmm1 10619; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10620; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1] 10621; AVX-NEXT: vmovaps 288(%rdi), %xmm1 10622; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10623; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] 10624; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 10625; AVX-NEXT: # xmm6 = mem[2,3,2,3] 10626; AVX-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] 10627; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] 10628; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm4[0,1,2,3],ymm0[4,5,6,7] 10629; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10630; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] 10631; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4] 10632; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm5[2,0],ymm8[5,4],ymm5[6,4] 10633; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] 10634; AVX-NEXT: vmovaps 544(%rdi), %xmm1 10635; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10636; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] 10637; AVX-NEXT: vmovaps 512(%rdi), %xmm6 10638; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] 10639; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3] 10640; AVX-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] 10641; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] 10642; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10643; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10644; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10645; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] 10646; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] 10647; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10648; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 10649; AVX-NEXT: # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4] 10650; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] 10651; AVX-NEXT: vmovaps 768(%rdi), %xmm1 10652; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10653; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] 10654; AVX-NEXT: vmovaps 736(%rdi), %xmm4 10655; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] 10656; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 10657; AVX-NEXT: # xmm9 = mem[2,3,2,3] 10658; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] 10659; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] 10660; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10661; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10662; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] 10663; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4] 10664; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm10[2,0],ymm11[5,4],ymm10[6,4] 10665; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] 10666; AVX-NEXT: vmovaps 992(%rdi), %xmm1 10667; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10668; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] 10669; AVX-NEXT: vmovaps 960(%rdi), %xmm3 10670; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] 10671; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,3,2,3] 10672; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] 10673; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] 10674; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10675; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10676; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10677; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] 10678; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] 10679; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 10680; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload 10681; AVX-NEXT: # ymm1 = 
ymm11[1,0],mem[2,0],ymm11[5,4],mem[6,4] 10682; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] 10683; AVX-NEXT: vmovaps 1216(%rdi), %xmm1 10684; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10685; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] 10686; AVX-NEXT: vmovaps 1184(%rdi), %xmm2 10687; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10688; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] 10689; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 10690; AVX-NEXT: # xmm14 = mem[2,3,2,3] 10691; AVX-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] 10692; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] 10693; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10694; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10695; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10696; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] 10697; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] 10698; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10699; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4] 10700; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] 10701; AVX-NEXT: vmovaps 1440(%rdi), %xmm1 10702; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10703; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm1[0,1,0,1] 10704; AVX-NEXT: vmovaps 1408(%rdi), %xmm1 10705; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3] 10706; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 10707; AVX-NEXT: # xmm10 = mem[2,3,2,3] 10708; AVX-NEXT: vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3] 10709; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] 10710; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] 10711; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10712; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10713; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] 10714; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] 10715; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 10716; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 10717; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm15[2,0],ymm5[5,4],ymm15[6,4] 10718; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm10[2,0],ymm0[6,4],ymm10[6,4] 10719; AVX-NEXT: vmovaps 1664(%rdi), %xmm0 10720; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10721; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm0[0,1,0,1] 10722; AVX-NEXT: vmovaps 1632(%rdi), %xmm0 10723; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3] 10724; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 10725; AVX-NEXT: # xmm9 = mem[2,3,2,3] 10726; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] 10727; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3] 10728; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm10[4,5,6,7] 10729; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10730; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10731; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload 10732; AVX-NEXT: # ymm9 = ymm2[2,1],mem[3,3],ymm2[6,5],mem[7,7] 10733; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 10734; AVX-NEXT: vblendps 
$13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload 10735; AVX-NEXT: # xmm10 = mem[0],xmm2[1],mem[2,3] 10736; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 10737; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,0],ymm9[2,0],ymm10[5,4],ymm9[6,4] 10738; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 10739; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm8 # 16-byte Folded Reload 10740; AVX-NEXT: # xmm8 = xmm14[0,1,2],mem[3] 10741; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 10742; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10743; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,0],ymm12[1,0],ymm2[4,4],ymm12[5,4] 10744; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10 10745; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm8[3,2] 10746; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] 10747; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10748; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm13[2,1],ymm7[3,3],ymm13[6,5],ymm7[7,7] 10749; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10750; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload 10751; AVX-NEXT: # xmm9 = mem[0],xmm9[1],mem[2,3] 10752; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 10753; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,0],ymm8[2,0],ymm9[5,4],ymm8[6,4] 10754; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 10755; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload 10756; AVX-NEXT: # xmm7 = mem[0,1,2],xmm7[3] 10757; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 10758; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 10759; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,0],ymm13[1,0],ymm10[4,4],ymm13[5,4] 10760; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9 10761; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,0],xmm7[3,2] 10762; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 10763; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10764; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10765; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload 10766; AVX-NEXT: # ymm7 = ymm7[2,1],mem[3,3],ymm7[6,5],mem[7,7] 10767; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 10768; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 10769; AVX-NEXT: # xmm8 = mem[0],xmm8[1],mem[2,3] 10770; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 10771; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[2,0],ymm8[5,4],ymm7[6,4] 10772; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload 10773; AVX-NEXT: # xmm6 = mem[0,1,2],xmm6[3] 10774; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 10775; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload 10776; AVX-NEXT: # ymm8 = ymm9[0,0],mem[1,0],ymm9[4,4],mem[5,4] 10777; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 10778; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm6[3,2] 10779; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] 10780; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10781; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 10782; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 10783; AVX-NEXT: # ymm6 = ymm6[2,1],mem[3,3],ymm6[6,5],mem[7,7] 10784; AVX-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 10785; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload 10786; AVX-NEXT: # xmm7 = mem[0],xmm7[1],mem[2,3] 10787; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 10788; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4] 10789; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 10790; AVX-NEXT: # xmm4 = mem[0,1,2],xmm4[3] 10791; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 10792; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload 10793; AVX-NEXT: # ymm7 = ymm8[0,0],mem[1,0],ymm8[4,4],mem[5,4] 10794; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 10795; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm7[2,0],xmm4[3,2] 10796; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] 10797; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10798; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 10799; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 10800; AVX-NEXT: # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] 10801; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 10802; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload 10803; AVX-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3] 10804; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 10805; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] 10806; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 10807; AVX-NEXT: # xmm3 = mem[0,1,2],xmm3[3] 10808; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10809; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload 10810; AVX-NEXT: # ymm6 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4] 10811; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 10812; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,0],xmm3[3,2] 10813; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] 10814; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10815; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload 10816; AVX-NEXT: # ymm3 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7] 10817; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 10818; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 10819; AVX-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3] 10820; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 10821; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] 10822; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 10823; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload 10824; AVX-NEXT: # xmm4 = xmm11[0,1,2],mem[3] 10825; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 10826; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 10827; AVX-NEXT: # ymm6 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4] 10828; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 10829; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,0],xmm4[3,2] 10830; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 10831; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10832; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 10833; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 10834; AVX-NEXT: # ymm3 = 
ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7] 10835; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 10836; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 10837; AVX-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3] 10838; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 10839; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] 10840; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 10841; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3] 10842; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 10843; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 10844; AVX-NEXT: # ymm4 = ymm4[0,0],mem[1,0],ymm4[4,4],mem[5,4] 10845; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 10846; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,0],xmm1[3,2] 10847; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 10848; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10849; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,1],ymm15[3,3],ymm5[6,5],ymm15[7,7] 10850; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 10851; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 10852; AVX-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3] 10853; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 10854; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm1[2,0],ymm3[5,4],ymm1[6,4] 10855; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 10856; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] 10857; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 10858; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload 10859; AVX-NEXT: # ymm3 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4] 10860; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 10861; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[3,2] 10862; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 10863; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10864; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 10865; AVX-NEXT: # xmm0 = mem[0,1,0,1] 10866; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3] 10867; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] 10868; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 10869; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3] 10870; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10871; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] 10872; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4] 10873; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 10874; AVX-NEXT: # xmm3 = mem[2,3,2,3] 10875; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 10876; AVX-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] 10877; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 10878; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] 10879; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] 10880; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 10881; AVX-NEXT: # xmm0 = mem[0,1,0,1] 10882; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10883; AVX-NEXT: # xmm0 = xmm0[0,1,2],mem[3] 10884; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm13[2,0],ymm10[5,4],ymm13[6,4] 10885; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 10886; AVX-NEXT: 
vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3] 10887; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10888; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] 10889; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4] 10890; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 10891; AVX-NEXT: # xmm4 = mem[2,3,2,3] 10892; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 10893; AVX-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 10894; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 10895; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] 10896; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7] 10897; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 10898; AVX-NEXT: # xmm0 = mem[0,1,0,1] 10899; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 10900; AVX-NEXT: # xmm0 = xmm0[0,1,2],mem[3] 10901; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload 10902; AVX-NEXT: # ymm3 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4] 10903; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 10904; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3] 10905; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 10906; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1] 10907; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[0,0],ymm4[7,4],ymm3[4,4] 10908; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 10909; AVX-NEXT: # xmm4 = mem[2,3,2,3] 10910; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 10911; AVX-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 10912; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 10913; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] 10914; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 10915; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 10916; AVX-NEXT: # xmm3 = mem[0,1,0,1] 10917; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 10918; AVX-NEXT: # xmm3 = xmm3[0,1,2],mem[3] 10919; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload 10920; AVX-NEXT: # ymm4 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] 10921; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 10922; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,3] 10923; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 10924; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3,0,1] 10925; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,0],ymm4[0,0],ymm5[7,4],ymm4[4,4] 10926; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 10927; AVX-NEXT: # xmm8 = mem[2,3,2,3] 10928; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 10929; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] 10930; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 10931; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,0],ymm8[4,5],ymm4[6,4] 10932; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] 10933; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 10934; AVX-NEXT: # xmm4 = mem[0,1,0,1] 10935; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 10936; AVX-NEXT: # xmm4 = xmm4[0,1,2],mem[3] 10937; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload 10938; AVX-NEXT: # ymm8 = 
ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] 10939; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 10940; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,0],xmm4[2,3] 10941; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 10942; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm5[2,3,0,1] 10943; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm5[3,0],ymm8[0,0],ymm5[7,4],ymm8[4,4] 10944; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 10945; AVX-NEXT: # xmm9 = mem[2,3,2,3] 10946; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload 10947; AVX-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] 10948; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 10949; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4] 10950; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm8[4,5,6,7] 10951; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 10952; AVX-NEXT: # xmm4 = mem[0,1,0,1] 10953; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm11[3] 10954; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 10955; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload 10956; AVX-NEXT: # ymm9 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] 10957; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9 10958; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,0],xmm4[2,3] 10959; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 10960; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm5[2,3,0,1] 10961; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm5[3,0],ymm9[0,0],ymm5[7,4],ymm9[4,4] 10962; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 10963; AVX-NEXT: # xmm10 = mem[2,3,2,3] 10964; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload 10965; AVX-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] 10966; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 10967; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] 10968; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] 10969; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 10970; AVX-NEXT: # xmm9 = mem[0,1,0,1] 10971; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload 10972; AVX-NEXT: # xmm5 = xmm9[0,1,2],mem[3] 10973; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10974; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload 10975; AVX-NEXT: # ymm9 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] 10976; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9 10977; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,0],xmm5[2,3] 10978; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10979; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm7[2,3,0,1] 10980; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm7[3,0],ymm9[0,0],ymm7[7,4],ymm9[4,4] 10981; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 10982; AVX-NEXT: # xmm10 = mem[2,3,2,3] 10983; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload 10984; AVX-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] 10985; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 10986; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] 10987; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] 10988; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 10989; AVX-NEXT: # xmm9 = mem[0,1,0,1] 10990; AVX-NEXT: vblendps {{.*#+}} xmm9 = 
xmm9[0,1,2],xmm15[3] 10991; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload 10992; AVX-NEXT: # ymm10 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] 10993; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10 10994; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm10[2,0],xmm9[2,3] 10995; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 10996; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm6[2,3,0,1] 10997; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm6[3,0],ymm10[0,0],ymm6[7,4],ymm10[4,4] 10998; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 10999; AVX-NEXT: # xmm12 = mem[2,3,2,3] 11000; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload 11001; AVX-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] 11002; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 11003; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,0],ymm12[4,5],ymm10[6,4] 11004; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 11005; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11006; AVX-NEXT: vmovaps %ymm6, 192(%rsi) 11007; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11008; AVX-NEXT: vmovaps %ymm6, 128(%rsi) 11009; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11010; AVX-NEXT: vmovaps %ymm6, 64(%rsi) 11011; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11012; AVX-NEXT: vmovaps %ymm6, (%rsi) 11013; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11014; AVX-NEXT: vmovaps %ymm6, 224(%rsi) 11015; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11016; AVX-NEXT: vmovaps %ymm10, 160(%rsi) 11017; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11018; AVX-NEXT: vmovaps %ymm10, 96(%rsi) 11019; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11020; AVX-NEXT: vmovaps %ymm10, 32(%rsi) 11021; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11022; AVX-NEXT: vmovaps %ymm6, 192(%rdx) 11023; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11024; AVX-NEXT: vmovaps %ymm6, 128(%rdx) 11025; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11026; AVX-NEXT: vmovaps %ymm6, 64(%rdx) 11027; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11028; AVX-NEXT: vmovaps %ymm6, (%rdx) 11029; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11030; AVX-NEXT: vmovaps %ymm6, 224(%rdx) 11031; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11032; AVX-NEXT: vmovaps %ymm6, 160(%rdx) 11033; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11034; AVX-NEXT: vmovaps %ymm6, 96(%rdx) 11035; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11036; AVX-NEXT: vmovaps %ymm6, 32(%rdx) 11037; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11038; AVX-NEXT: vmovaps %ymm6, 192(%rcx) 11039; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11040; AVX-NEXT: vmovaps %ymm6, 128(%rcx) 11041; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11042; AVX-NEXT: vmovaps %ymm6, 64(%rcx) 11043; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11044; AVX-NEXT: vmovaps %ymm6, (%rcx) 11045; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11046; AVX-NEXT: vmovaps %ymm6, 224(%rcx) 11047; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11048; AVX-NEXT: vmovaps %ymm6, 160(%rcx) 11049; 
AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11050; AVX-NEXT: vmovaps %ymm6, 96(%rcx) 11051; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11052; AVX-NEXT: vmovaps %ymm6, 32(%rcx) 11053; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11054; AVX-NEXT: vmovaps %ymm6, (%r8) 11055; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11056; AVX-NEXT: vmovaps %ymm6, 64(%r8) 11057; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11058; AVX-NEXT: vmovaps %ymm6, 128(%r8) 11059; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11060; AVX-NEXT: vmovaps %ymm6, 192(%r8) 11061; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11062; AVX-NEXT: vmovaps %ymm6, 224(%r8) 11063; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11064; AVX-NEXT: vmovaps %ymm6, 160(%r8) 11065; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11066; AVX-NEXT: vmovaps %ymm6, 96(%r8) 11067; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11068; AVX-NEXT: vmovaps %ymm6, 32(%r8) 11069; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11070; AVX-NEXT: vmovaps %ymm6, 224(%r9) 11071; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11072; AVX-NEXT: vmovaps %ymm6, 192(%r9) 11073; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11074; AVX-NEXT: vmovaps %ymm6, 160(%r9) 11075; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11076; AVX-NEXT: vmovaps %ymm6, 128(%r9) 11077; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11078; AVX-NEXT: vmovaps %ymm6, 96(%r9) 11079; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11080; AVX-NEXT: vmovaps %ymm6, 64(%r9) 11081; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11082; AVX-NEXT: vmovaps %ymm6, 32(%r9) 11083; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11084; AVX-NEXT: vmovaps %ymm6, (%r9) 11085; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 11086; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11087; AVX-NEXT: vmovaps %ymm6, 224(%rax) 11088; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11089; AVX-NEXT: vmovaps %ymm6, 192(%rax) 11090; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11091; AVX-NEXT: vmovaps %ymm6, 160(%rax) 11092; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11093; AVX-NEXT: vmovaps %ymm6, 128(%rax) 11094; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11095; AVX-NEXT: vmovaps %ymm6, 96(%rax) 11096; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11097; AVX-NEXT: vmovaps %ymm6, 64(%rax) 11098; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11099; AVX-NEXT: vmovaps %ymm6, 32(%rax) 11100; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11101; AVX-NEXT: vmovaps %ymm6, (%rax) 11102; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 11103; AVX-NEXT: vmovaps %ymm9, 224(%rax) 11104; AVX-NEXT: vmovaps %ymm5, 192(%rax) 11105; AVX-NEXT: vmovaps %ymm4, 160(%rax) 11106; AVX-NEXT: vmovaps %ymm8, 128(%rax) 11107; AVX-NEXT: vmovaps %ymm3, 96(%rax) 11108; AVX-NEXT: vmovaps %ymm0, 64(%rax) 11109; AVX-NEXT: vmovaps %ymm1, 32(%rax) 11110; AVX-NEXT: vmovaps %ymm2, (%rax) 11111; AVX-NEXT: addq $3176, %rsp # imm = 0xC68 11112; AVX-NEXT: vzeroupper 11113; AVX-NEXT: retq 11114; 11115; AVX2-LABEL: load_i32_stride7_vf64: 11116; AVX2: # 
%bb.0: 11117; AVX2-NEXT: subq $2648, %rsp # imm = 0xA58 11118; AVX2-NEXT: vmovdqa 1216(%rdi), %ymm9 11119; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11120; AVX2-NEXT: vmovdqa 1152(%rdi), %ymm4 11121; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11122; AVX2-NEXT: vmovdqa 1120(%rdi), %ymm5 11123; AVX2-NEXT: vmovdqa 768(%rdi), %ymm12 11124; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11125; AVX2-NEXT: vmovdqa 704(%rdi), %ymm6 11126; AVX2-NEXT: vmovdqa 672(%rdi), %ymm7 11127; AVX2-NEXT: vmovdqa 320(%rdi), %ymm8 11128; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11129; AVX2-NEXT: vmovdqa 256(%rdi), %ymm10 11130; AVX2-NEXT: vmovdqa 224(%rdi), %ymm11 11131; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] 11132; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm10[6],ymm11[7] 11133; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11134; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11135; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 11136; AVX2-NEXT: vpbroadcastq 304(%rdi), %ymm2 11137; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] 11138; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 11139; AVX2-NEXT: vmovdqa 352(%rdi), %xmm2 11140; AVX2-NEXT: vmovdqa 384(%rdi), %xmm3 11141; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11142; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 11143; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11144; AVX2-NEXT: vpbroadcastd 420(%rdi), %ymm3 11145; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 11146; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 11147; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11148; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] 11149; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11150; AVX2-NEXT: vmovdqa %ymm6, %ymm8 11151; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11152; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 11153; AVX2-NEXT: vpbroadcastq 752(%rdi), %ymm2 11154; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] 11155; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 11156; AVX2-NEXT: vmovdqa 800(%rdi), %xmm2 11157; AVX2-NEXT: vmovdqa 832(%rdi), %xmm3 11158; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11159; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 11160; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11161; AVX2-NEXT: vpbroadcastd 868(%rdi), %ymm3 11162; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 11163; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 11164; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11165; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] 11166; AVX2-NEXT: vmovdqa %ymm5, %ymm6 11167; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11168; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 11169; AVX2-NEXT: vpbroadcastq 1200(%rdi), %ymm2 11170; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] 11171; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 11172; AVX2-NEXT: vmovdqa 1248(%rdi), %xmm2 11173; AVX2-NEXT: vmovdqa 1280(%rdi), %xmm3 11174; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11175; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 11176; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 
11177; AVX2-NEXT: vpbroadcastd 1316(%rdi), %ymm3 11178; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 11179; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 11180; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11181; AVX2-NEXT: vmovdqa 1600(%rdi), %ymm13 11182; AVX2-NEXT: vmovdqa 1568(%rdi), %ymm5 11183; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm13[6],ymm5[7] 11184; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11185; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11186; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 11187; AVX2-NEXT: vmovdqa 1664(%rdi), %ymm3 11188; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11189; AVX2-NEXT: vpbroadcastq 1648(%rdi), %ymm2 11190; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 11191; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 11192; AVX2-NEXT: vmovdqa 1696(%rdi), %xmm2 11193; AVX2-NEXT: vmovdqa 1728(%rdi), %xmm3 11194; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11195; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 11196; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11197; AVX2-NEXT: vpbroadcastd 1764(%rdi), %ymm3 11198; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 11199; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 11200; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11201; AVX2-NEXT: vmovdqa 96(%rdi), %ymm2 11202; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11203; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm1 11204; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 11205; AVX2-NEXT: vmovdqa (%rdi), %ymm2 11206; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11207; AVX2-NEXT: vmovdqa 32(%rdi), %ymm3 11208; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11209; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] 11210; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 11211; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 11212; AVX2-NEXT: vmovdqa 128(%rdi), %xmm2 11213; AVX2-NEXT: vmovdqa 160(%rdi), %xmm3 11214; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11215; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 11216; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11217; AVX2-NEXT: vpbroadcastd 196(%rdi), %ymm3 11218; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 11219; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 11220; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11221; AVX2-NEXT: vmovdqa 480(%rdi), %ymm2 11222; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11223; AVX2-NEXT: vmovdqa 448(%rdi), %ymm1 11224; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11225; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] 11226; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 11227; AVX2-NEXT: vmovdqa 544(%rdi), %ymm3 11228; AVX2-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill 11229; AVX2-NEXT: vpbroadcastq 528(%rdi), %ymm2 11230; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 11231; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 11232; AVX2-NEXT: vmovdqa 576(%rdi), %xmm2 11233; AVX2-NEXT: vmovdqa 608(%rdi), %xmm3 11234; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11235; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 11236; 
AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11237; AVX2-NEXT: vpbroadcastd 644(%rdi), %ymm3 11238; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 11239; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 11240; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11241; AVX2-NEXT: vmovdqa 928(%rdi), %ymm2 11242; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11243; AVX2-NEXT: vmovdqa 896(%rdi), %ymm1 11244; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11245; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] 11246; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 11247; AVX2-NEXT: vmovdqa 992(%rdi), %ymm3 11248; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11249; AVX2-NEXT: vpbroadcastq 976(%rdi), %ymm2 11250; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 11251; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 11252; AVX2-NEXT: vmovdqa 1024(%rdi), %xmm2 11253; AVX2-NEXT: vmovdqa 1056(%rdi), %xmm3 11254; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11255; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 11256; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11257; AVX2-NEXT: vpbroadcastd 1092(%rdi), %ymm3 11258; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 11259; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 11260; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11261; AVX2-NEXT: vmovdqa 1376(%rdi), %ymm14 11262; AVX2-NEXT: vmovdqa 1344(%rdi), %ymm15 11263; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] 11264; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11265; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11266; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm0 11267; AVX2-NEXT: vmovdqa 1440(%rdi), %ymm4 11268; AVX2-NEXT: vpbroadcastq 1424(%rdi), %ymm1 11269; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] 11270; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11271; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 11272; AVX2-NEXT: vmovdqa 1472(%rdi), %xmm1 11273; AVX2-NEXT: vmovdqa 1504(%rdi), %xmm2 11274; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11275; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] 11276; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11277; AVX2-NEXT: vpbroadcastd 1540(%rdi), %ymm2 11278; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 11279; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 11280; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11281; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] 11282; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 11283; AVX2-NEXT: vmovdqa 384(%rdi), %ymm1 11284; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11285; AVX2-NEXT: vmovdqa 352(%rdi), %ymm0 11286; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11287; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] 11288; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] 11289; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] 11290; AVX2-NEXT: vmovdqa 288(%rdi), %ymm12 11291; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 11292; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] 11293; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11294; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] 11295; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11296; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] 11297; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] 11298; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 11299; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 11300; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11301; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 11302; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11303; AVX2-NEXT: vmovdqa 832(%rdi), %ymm3 11304; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11305; AVX2-NEXT: vmovdqa 800(%rdi), %ymm2 11306; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11307; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 11308; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 11309; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 11310; AVX2-NEXT: vmovdqa 736(%rdi), %ymm2 11311; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11312; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 11313; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] 11314; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] 11315; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11316; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7] 11317; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 11318; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 11319; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11320; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 11321; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11322; AVX2-NEXT: vmovdqa 1280(%rdi), %ymm3 11323; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11324; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm2 11325; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11326; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 11327; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 11328; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 11329; AVX2-NEXT: vmovdqa 1184(%rdi), %ymm2 11330; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11331; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11332; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] 11333; AVX2-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload 11334; AVX2-NEXT: # ymm7 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7] 11335; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11336; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6],ymm7[7] 11337; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 11338; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 11339; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11340; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 11341; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11342; AVX2-NEXT: vmovdqa 1728(%rdi), %ymm3 11343; AVX2-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11344; AVX2-NEXT: vmovdqa 1696(%rdi), %ymm2 11345; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11346; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 11347; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 11348; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 11349; AVX2-NEXT: vmovdqa 1632(%rdi), %ymm2 11350; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11351; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11352; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] 11353; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] 11354; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11355; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7] 11356; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 11357; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 11358; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11359; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 11360; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11361; AVX2-NEXT: vmovdqa 608(%rdi), %ymm3 11362; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11363; AVX2-NEXT: vmovdqa 576(%rdi), %ymm2 11364; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11365; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 11366; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 11367; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 11368; AVX2-NEXT: vmovdqa 512(%rdi), %ymm2 11369; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11370; AVX2-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload 11371; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] 11372; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11373; AVX2-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 11374; AVX2-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] 11375; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11376; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] 11377; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 11378; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 11379; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11380; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 11381; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11382; AVX2-NEXT: vmovdqa 1056(%rdi), %ymm3 11383; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11384; AVX2-NEXT: vmovdqa 1024(%rdi), %ymm2 11385; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11386; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 11387; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 11388; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 11389; AVX2-NEXT: vmovdqa 960(%rdi), %ymm2 11390; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11391; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 11392; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7] 11393; AVX2-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11394; AVX2-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 11395; AVX2-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] 11396; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11397; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] 11398; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 11399; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 11400; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11401; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 11402; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11403; AVX2-NEXT: vmovdqa 1504(%rdi), %ymm3 11404; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11405; AVX2-NEXT: vmovdqa 1472(%rdi), %ymm2 11406; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11407; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 11408; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 11409; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 11410; AVX2-NEXT: vmovdqa 1408(%rdi), %ymm2 11411; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11412; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] 11413; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] 11414; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11415; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] 11416; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 11417; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 11418; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11419; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 11420; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11421; AVX2-NEXT: vmovdqa 160(%rdi), %ymm15 11422; AVX2-NEXT: vmovdqa 128(%rdi), %ymm14 11423; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm15[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] 11424; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11425; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11426; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 11427; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 11428; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4 11429; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 11430; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] 11431; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11432; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11433; AVX2-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 11434; AVX2-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] 11435; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11436; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] 11437; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm0 11438; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 11439; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11440; AVX2-NEXT: vmovdqa 304(%rdi), %xmm0 11441; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] 11442; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 11443; AVX2-NEXT: vpbroadcastd 232(%rdi), %xmm1 11444; AVX2-NEXT: vmovdqa 256(%rdi), %xmm5 11445; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] 11446; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 11447; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 11448; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11449; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] 11450; AVX2-NEXT: vpbroadcastd 428(%rdi), %ymm2 11451; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 11452; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 11453; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11454; AVX2-NEXT: vmovdqa 752(%rdi), %xmm0 11455; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11456; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 11457; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 11458; AVX2-NEXT: vpbroadcastd 680(%rdi), %xmm1 11459; AVX2-NEXT: vmovdqa 704(%rdi), %xmm2 11460; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11461; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] 11462; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 11463; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11464; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11465; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] 11466; AVX2-NEXT: vpbroadcastd 876(%rdi), %ymm2 11467; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 11468; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 11469; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11470; AVX2-NEXT: vmovdqa 1200(%rdi), %xmm0 11471; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11472; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 11473; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 11474; AVX2-NEXT: vpbroadcastd 1128(%rdi), %xmm1 11475; AVX2-NEXT: vmovdqa 1152(%rdi), %xmm2 11476; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11477; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] 11478; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 11479; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11480; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 11481; AVX2-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] 11482; AVX2-NEXT: vpbroadcastd 1324(%rdi), %ymm2 11483; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 11484; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 11485; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11486; AVX2-NEXT: vmovdqa 1648(%rdi), %xmm0 11487; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11488; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 11489; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 11490; AVX2-NEXT: vpbroadcastd 1576(%rdi), %xmm1 11491; AVX2-NEXT: vmovdqa 1600(%rdi), %xmm2 11492; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm1[0],xmm2[1],xmm1[2,3] 11493; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 11494; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11495; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 11496; AVX2-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] 11497; AVX2-NEXT: vpbroadcastd 1772(%rdi), %ymm6 11498; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] 11499; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 11500; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11501; AVX2-NEXT: vmovdqa 80(%rdi), %xmm0 11502; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] 11503; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 11504; AVX2-NEXT: vpbroadcastd 8(%rdi), %xmm1 11505; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 11506; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11507; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] 11508; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 11509; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] 11510; AVX2-NEXT: vpbroadcastd 204(%rdi), %ymm6 11511; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] 11512; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 11513; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11514; AVX2-NEXT: vmovdqa 528(%rdi), %xmm0 11515; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11516; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 11517; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 11518; AVX2-NEXT: vpbroadcastd 456(%rdi), %xmm1 11519; AVX2-NEXT: vmovdqa 480(%rdi), %xmm4 11520; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11521; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] 11522; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 11523; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11524; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 11525; AVX2-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] 11526; AVX2-NEXT: vpbroadcastd 652(%rdi), %ymm15 11527; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] 11528; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 11529; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11530; AVX2-NEXT: vmovdqa 976(%rdi), %xmm0 11531; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11532; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 11533; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 11534; AVX2-NEXT: vpbroadcastd 904(%rdi), %xmm15 11535; AVX2-NEXT: vmovdqa 928(%rdi), %xmm11 11536; AVX2-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3] 11537; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] 11538; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 11539; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11540; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] 11541; AVX2-NEXT: vpbroadcastd 1100(%rdi), %ymm14 11542; AVX2-NEXT: vpblendd 
{{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] 11543; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] 11544; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11545; AVX2-NEXT: vmovdqa 1424(%rdi), %xmm0 11546; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 11547; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 11548; AVX2-NEXT: vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] 11549; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] 11550; AVX2-NEXT: vpbroadcastd 1352(%rdi), %xmm15 11551; AVX2-NEXT: vmovdqa 1376(%rdi), %xmm0 11552; AVX2-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] 11553; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] 11554; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11555; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11556; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] 11557; AVX2-NEXT: vpbroadcastd 1548(%rdi), %ymm13 11558; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] 11559; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] 11560; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11561; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11562; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload 11563; AVX2-NEXT: # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] 11564; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] 11565; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3] 11566; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] 11567; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] 11568; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11569; AVX2-NEXT: vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm12[1,3],ymm4[4,6],ymm12[5,7] 11570; AVX2-NEXT: vmovaps %ymm4, %ymm12 11571; AVX2-NEXT: vbroadcastss 432(%rdi), %ymm14 11572; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] 11573; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7] 11574; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11575; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11576; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload 11577; AVX2-NEXT: # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] 11578; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 11579; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] 11580; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] 11581; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] 11582; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] 11583; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 11584; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm3[1,3],ymm14[4,6],ymm3[5,7] 11585; AVX2-NEXT: vbroadcastss 880(%rdi), %ymm13 11586; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] 11587; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] 11588; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11589; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11590; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload 11591; AVX2-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] 11592; AVX2-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 11593; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] 11594; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] 11595; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] 11596; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] 11597; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11598; AVX2-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 11599; AVX2-NEXT: # ymm4 = ymm4[0,2],mem[1,3],ymm4[4,6],mem[5,7] 11600; AVX2-NEXT: vbroadcastss 1328(%rdi), %ymm5 11601; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] 11602; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] 11603; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11604; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11605; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 11606; AVX2-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] 11607; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] 11608; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] 11609; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] 11610; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] 11611; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 11612; AVX2-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload 11613; AVX2-NEXT: # ymm3 = ymm13[0,2],mem[1,3],ymm13[4,6],mem[5,7] 11614; AVX2-NEXT: vbroadcastss 1776(%rdi), %ymm4 11615; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] 11616; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] 11617; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11618; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7] 11619; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] 11620; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] 11621; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] 11622; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] 11623; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] 11624; AVX2-NEXT: vmovaps %ymm1, %ymm9 11625; AVX2-NEXT: vbroadcastss 1552(%rdi), %ymm3 11626; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 11627; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 11628; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11629; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11630; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11631; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] 11632; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],mem[3] 11633; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] 11634; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 11635; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 11636; AVX2-NEXT: vmovdqa %ymm10, %ymm8 11637; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7] 11638; AVX2-NEXT: vmovaps %ymm7, %ymm11 11639; AVX2-NEXT: vbroadcastss 1104(%rdi), %ymm2 11640; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 11641; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 11642; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11643; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 11644; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 
32-byte Folded Reload 11645; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] 11646; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 11647; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] 11648; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] 11649; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 11650; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 11651; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11652; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 11653; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm7[1,3],ymm4[4,6],ymm7[5,7] 11654; AVX2-NEXT: vbroadcastss 656(%rdi), %ymm2 11655; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 11656; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 11657; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11658; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11659; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11660; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] 11661; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 11662; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] 11663; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] 11664; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 11665; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 11666; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11667; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11668; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm10[1,3],ymm3[4,6],ymm10[5,7] 11669; AVX2-NEXT: vbroadcastss 208(%rdi), %ymm2 11670; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 11671; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 11672; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11673; AVX2-NEXT: vbroadcastss 100(%rdi), %xmm0 11674; AVX2-NEXT: vmovaps 64(%rdi), %xmm6 11675; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] 11676; AVX2-NEXT: vmovsd {{.*#+}} xmm5 = [4,3,0,0] 11677; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11678; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 11679; AVX2-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] 11680; AVX2-NEXT: vpermps %ymm1, %ymm5, %ymm1 11681; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 11682; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm15 = [0,7,0,7,0,7,0,7] 11683; AVX2-NEXT: vpermps %ymm3, %ymm15, %ymm1 11684; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] 11685; AVX2-NEXT: vbroadcastss 212(%rdi), %ymm2 11686; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 11687; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11688; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11689; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11690; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11691; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 11692; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm0 11693; AVX2-NEXT: vbroadcastss 324(%rdi), %xmm2 11694; AVX2-NEXT: vmovaps 288(%rdi), %xmm1 11695; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] 11696; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 11697; AVX2-NEXT: vpermps %ymm12, %ymm15, %ymm2 11698; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, 
%ymm2 # 32-byte Folded Reload 11699; AVX2-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] 11700; AVX2-NEXT: vbroadcastss 436(%rdi), %ymm3 11701; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 11702; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 11703; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11704; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11705; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11706; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 11707; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm0 11708; AVX2-NEXT: vbroadcastss 548(%rdi), %xmm3 11709; AVX2-NEXT: vmovaps 512(%rdi), %xmm2 11710; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] 11711; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] 11712; AVX2-NEXT: vpermps %ymm4, %ymm15, %ymm3 11713; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] 11714; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm4 11715; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] 11716; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 11717; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 11718; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11719; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11720; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 11721; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm0 11722; AVX2-NEXT: vbroadcastss 772(%rdi), %xmm4 11723; AVX2-NEXT: vmovaps 736(%rdi), %xmm3 11724; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] 11725; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] 11726; AVX2-NEXT: vpermps %ymm14, %ymm15, %ymm4 11727; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 11728; AVX2-NEXT: # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] 11729; AVX2-NEXT: vbroadcastss 884(%rdi), %ymm7 11730; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] 11731; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] 11732; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11733; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11734; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11735; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 11736; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm0 11737; AVX2-NEXT: vbroadcastss 996(%rdi), %xmm7 11738; AVX2-NEXT: vmovaps 960(%rdi), %xmm4 11739; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] 11740; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] 11741; AVX2-NEXT: vpermps %ymm8, %ymm15, %ymm7 11742; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] 11743; AVX2-NEXT: vbroadcastss 1108(%rdi), %ymm8 11744; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] 11745; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] 11746; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11747; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11748; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11749; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 11750; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm0 11751; AVX2-NEXT: vbroadcastss 1220(%rdi), %xmm7 11752; AVX2-NEXT: vmovaps 1184(%rdi), %xmm14 11753; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] 11754; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] 11755; AVX2-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-NEXT: vpermps %ymm10, %ymm15, %ymm7
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
; AVX2-NEXT: vbroadcastss 1332(%rdi), %ymm8
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm7
; AVX2-NEXT: vbroadcastss 1444(%rdi), %xmm8
; AVX2-NEXT: vmovaps 1408(%rdi), %xmm0
; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3]
; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload
; AVX2-NEXT: vmovaps %ymm9, %ymm11
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
; AVX2-NEXT: vbroadcastss 1556(%rdi), %ymm9
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vpermps %ymm7, %ymm5, %ymm7
; AVX2-NEXT: vbroadcastss 1668(%rdi), %xmm8
; AVX2-NEXT: vmovaps 1632(%rdi), %xmm5
; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3]
; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
; AVX2-NEXT: vpermps %ymm13, %ymm15, %ymm8
; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-NEXT: # ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vbroadcastss 1780(%rdi), %ymm9
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
; AVX2-NEXT: vbroadcastss 216(%rdi), %ymm8
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
; AVX2-NEXT: vmovaps 96(%rdi), %xmm9
; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2]
; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7]
; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 320(%rdi), %xmm13
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7]
; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-NEXT: # ymm6 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
; AVX2-NEXT: vbroadcastss 440(%rdi), %ymm7
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 544(%rdi), %xmm8
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm2[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7]
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-NEXT: vbroadcastss 664(%rdi), %ymm6
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 768(%rdi), %xmm1
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7]
; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-NEXT: vbroadcastss 888(%rdi), %ymm6
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 992(%rdi), %xmm2
; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7]
; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-NEXT: vbroadcastss 1112(%rdi), %ymm6
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vmovaps 1216(%rdi), %xmm3
; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm14[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7]
; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
; AVX2-NEXT: vbroadcastss 1336(%rdi), %ymm10
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vmovaps 1440(%rdi), %xmm4
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX2-NEXT: # ymm10 = mem[1,0,2,3,5,4,6,7]
; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
; AVX2-NEXT: # ymm10 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0,3,3,5,4,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
; AVX2-NEXT: vbroadcastss 1560(%rdi), %ymm12
; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm10[4,5,6,7]
; AVX2-NEXT: vmovaps 1664(%rdi), %xmm14
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm5[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7]
; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-NEXT: vbroadcastss 1784(%rdi), %ymm12
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7]
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX2-NEXT: vbroadcastss 136(%rdi), %xmm0
; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpermps 192(%rdi), %ymm15, %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vbroadcastss 80(%rdi),
%ymm5 11921; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3] 11922; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 11923; AVX2-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] 11924; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 11925; AVX2-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] 11926; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm11 11927; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] 11928; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm0[4,5,6,7] 11929; AVX2-NEXT: vbroadcastss 360(%rdi), %xmm0 11930; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 11931; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 11932; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 11933; AVX2-NEXT: vpermps 416(%rdi), %ymm15, %ymm5 11934; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] 11935; AVX2-NEXT: vbroadcastss 304(%rdi), %ymm5 11936; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm13[3] 11937; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload 11938; AVX2-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] 11939; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload 11940; AVX2-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] 11941; AVX2-NEXT: vextractf128 $1, %ymm13, %xmm13 11942; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] 11943; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm0[4,5,6,7] 11944; AVX2-NEXT: vbroadcastss 584(%rdi), %xmm0 11945; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 11946; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 11947; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 11948; AVX2-NEXT: vpermps 640(%rdi), %ymm15, %ymm5 11949; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] 11950; AVX2-NEXT: vbroadcastss 528(%rdi), %ymm5 11951; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3] 11952; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload 11953; AVX2-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] 11954; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload 11955; AVX2-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] 11956; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8 11957; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] 11958; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm0[4,5,6,7] 11959; AVX2-NEXT: vbroadcastss 808(%rdi), %xmm0 11960; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 11961; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 11962; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 11963; AVX2-NEXT: vpermps 864(%rdi), %ymm15, %ymm5 11964; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] 11965; AVX2-NEXT: vbroadcastss 752(%rdi), %ymm5 11966; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] 11967; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 11968; AVX2-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] 11969; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 11970; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] 11971; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 11972; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] 11973; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 11974; AVX2-NEXT: vbroadcastss 1032(%rdi), %xmm1 11975; AVX2-NEXT: 
vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 11976; AVX2-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] 11977; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 11978; AVX2-NEXT: vpermps 1088(%rdi), %ymm15, %ymm5 11979; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] 11980; AVX2-NEXT: vbroadcastss 976(%rdi), %ymm5 11981; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] 11982; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 11983; AVX2-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] 11984; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 11985; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] 11986; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 11987; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] 11988; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 11989; AVX2-NEXT: vbroadcastss 1256(%rdi), %xmm2 11990; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 11991; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 11992; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 11993; AVX2-NEXT: vpermps 1312(%rdi), %ymm15, %ymm5 11994; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] 11995; AVX2-NEXT: vbroadcastss 1200(%rdi), %ymm5 11996; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] 11997; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 11998; AVX2-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] 11999; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 12000; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] 12001; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 12002; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] 12003; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 12004; AVX2-NEXT: vbroadcastss 1480(%rdi), %xmm3 12005; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 12006; AVX2-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] 12007; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 12008; AVX2-NEXT: vpermps 1536(%rdi), %ymm15, %ymm5 12009; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] 12010; AVX2-NEXT: vbroadcastss 1424(%rdi), %ymm5 12011; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] 12012; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 12013; AVX2-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] 12014; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 12015; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] 12016; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 12017; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] 12018; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 12019; AVX2-NEXT: vbroadcastss 1704(%rdi), %xmm4 12020; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 12021; AVX2-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 12022; AVX2-NEXT: vpermps 1760(%rdi), %ymm15, %ymm5 12023; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 12024; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 12025; AVX2-NEXT: vbroadcastss 1648(%rdi), %ymm5 12026; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3] 12027; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 12028; AVX2-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] 12029; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte 
Folded Reload 12030; AVX2-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] 12031; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14 12032; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] 12033; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 12034; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12035; AVX2-NEXT: vmovaps %ymm5, 192(%rsi) 12036; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12037; AVX2-NEXT: vmovaps %ymm5, 128(%rsi) 12038; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12039; AVX2-NEXT: vmovaps %ymm5, 64(%rsi) 12040; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12041; AVX2-NEXT: vmovaps %ymm5, (%rsi) 12042; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12043; AVX2-NEXT: vmovaps %ymm5, 224(%rsi) 12044; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12045; AVX2-NEXT: vmovaps %ymm5, 160(%rsi) 12046; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12047; AVX2-NEXT: vmovaps %ymm5, 96(%rsi) 12048; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12049; AVX2-NEXT: vmovaps %ymm5, 32(%rsi) 12050; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12051; AVX2-NEXT: vmovaps %ymm5, 192(%rdx) 12052; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12053; AVX2-NEXT: vmovaps %ymm5, 128(%rdx) 12054; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12055; AVX2-NEXT: vmovaps %ymm5, 64(%rdx) 12056; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12057; AVX2-NEXT: vmovaps %ymm5, (%rdx) 12058; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12059; AVX2-NEXT: vmovaps %ymm5, 224(%rdx) 12060; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12061; AVX2-NEXT: vmovaps %ymm5, 160(%rdx) 12062; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12063; AVX2-NEXT: vmovaps %ymm5, 96(%rdx) 12064; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12065; AVX2-NEXT: vmovaps %ymm5, 32(%rdx) 12066; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12067; AVX2-NEXT: vmovaps %ymm5, 192(%rcx) 12068; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12069; AVX2-NEXT: vmovaps %ymm5, 128(%rcx) 12070; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12071; AVX2-NEXT: vmovaps %ymm5, 64(%rcx) 12072; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12073; AVX2-NEXT: vmovaps %ymm5, (%rcx) 12074; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12075; AVX2-NEXT: vmovaps %ymm5, 224(%rcx) 12076; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12077; AVX2-NEXT: vmovaps %ymm5, 160(%rcx) 12078; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12079; AVX2-NEXT: vmovaps %ymm5, 96(%rcx) 12080; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12081; AVX2-NEXT: vmovaps %ymm5, 32(%rcx) 12082; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12083; AVX2-NEXT: vmovaps %ymm5, (%r8) 12084; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12085; AVX2-NEXT: vmovaps %ymm5, 64(%r8) 12086; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12087; AVX2-NEXT: vmovaps %ymm5, 128(%r8) 12088; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12089; AVX2-NEXT: vmovaps %ymm5, 192(%r8) 
12090; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12091; AVX2-NEXT: vmovaps %ymm5, 224(%r8) 12092; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12093; AVX2-NEXT: vmovaps %ymm5, 160(%r8) 12094; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12095; AVX2-NEXT: vmovaps %ymm5, 96(%r8) 12096; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12097; AVX2-NEXT: vmovaps %ymm5, 32(%r8) 12098; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12099; AVX2-NEXT: vmovaps %ymm5, 224(%r9) 12100; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12101; AVX2-NEXT: vmovaps %ymm5, 192(%r9) 12102; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12103; AVX2-NEXT: vmovaps %ymm5, 160(%r9) 12104; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12105; AVX2-NEXT: vmovaps %ymm5, 128(%r9) 12106; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12107; AVX2-NEXT: vmovaps %ymm5, 96(%r9) 12108; AVX2-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload 12109; AVX2-NEXT: vmovaps %ymm5, 64(%r9) 12110; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12111; AVX2-NEXT: vmovaps %ymm5, 32(%r9) 12112; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12113; AVX2-NEXT: vmovaps %ymm5, (%r9) 12114; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 12115; AVX2-NEXT: vmovaps %ymm12, 224(%rax) 12116; AVX2-NEXT: vmovaps %ymm10, 192(%rax) 12117; AVX2-NEXT: vmovaps %ymm6, 160(%rax) 12118; AVX2-NEXT: vmovaps %ymm7, 128(%rax) 12119; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12120; AVX2-NEXT: vmovaps %ymm5, 96(%rax) 12121; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12122; AVX2-NEXT: vmovaps %ymm5, 64(%rax) 12123; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12124; AVX2-NEXT: vmovaps %ymm5, 32(%rax) 12125; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12126; AVX2-NEXT: vmovaps %ymm5, (%rax) 12127; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 12128; AVX2-NEXT: vmovaps %ymm4, 224(%rax) 12129; AVX2-NEXT: vmovaps %ymm3, 192(%rax) 12130; AVX2-NEXT: vmovaps %ymm2, 160(%rax) 12131; AVX2-NEXT: vmovaps %ymm1, 128(%rax) 12132; AVX2-NEXT: vmovaps %ymm0, 96(%rax) 12133; AVX2-NEXT: vmovaps %ymm8, 64(%rax) 12134; AVX2-NEXT: vmovaps %ymm13, 32(%rax) 12135; AVX2-NEXT: vmovaps %ymm11, (%rax) 12136; AVX2-NEXT: addq $2648, %rsp # imm = 0xA58 12137; AVX2-NEXT: vzeroupper 12138; AVX2-NEXT: retq 12139; 12140; AVX2-FP-LABEL: load_i32_stride7_vf64: 12141; AVX2-FP: # %bb.0: 12142; AVX2-FP-NEXT: subq $2648, %rsp # imm = 0xA58 12143; AVX2-FP-NEXT: vmovdqa 1216(%rdi), %ymm9 12144; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12145; AVX2-FP-NEXT: vmovdqa 1152(%rdi), %ymm4 12146; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12147; AVX2-FP-NEXT: vmovdqa 1120(%rdi), %ymm5 12148; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm12 12149; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12150; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm6 12151; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm7 12152; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm8 12153; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12154; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm10 12155; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm11 12156; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] 12157; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm10[6],ymm11[7] 
12158; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12159; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12160; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 12161; AVX2-FP-NEXT: vpbroadcastq 304(%rdi), %ymm2 12162; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] 12163; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 12164; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm2 12165; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm3 12166; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12167; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 12168; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12169; AVX2-FP-NEXT: vpbroadcastd 420(%rdi), %ymm3 12170; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 12171; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 12172; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12173; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] 12174; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12175; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm8 12176; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12177; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 12178; AVX2-FP-NEXT: vpbroadcastq 752(%rdi), %ymm2 12179; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] 12180; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 12181; AVX2-FP-NEXT: vmovdqa 800(%rdi), %xmm2 12182; AVX2-FP-NEXT: vmovdqa 832(%rdi), %xmm3 12183; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12184; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 12185; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12186; AVX2-FP-NEXT: vpbroadcastd 868(%rdi), %ymm3 12187; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 12188; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 12189; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12190; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] 12191; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm6 12192; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12193; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 12194; AVX2-FP-NEXT: vpbroadcastq 1200(%rdi), %ymm2 12195; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] 12196; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 12197; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %xmm2 12198; AVX2-FP-NEXT: vmovdqa 1280(%rdi), %xmm3 12199; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12200; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 12201; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12202; AVX2-FP-NEXT: vpbroadcastd 1316(%rdi), %ymm3 12203; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 12204; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 12205; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12206; AVX2-FP-NEXT: vmovdqa 1600(%rdi), %ymm13 12207; AVX2-FP-NEXT: vmovdqa 1568(%rdi), %ymm5 12208; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm13[6],ymm5[7] 12209; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12210; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12211; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 12212; AVX2-FP-NEXT: vmovdqa 1664(%rdi), %ymm3 12213; AVX2-FP-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12214; AVX2-FP-NEXT: vpbroadcastq 1648(%rdi), %ymm2 12215; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 12216; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 12217; AVX2-FP-NEXT: vmovdqa 1696(%rdi), %xmm2 12218; AVX2-FP-NEXT: vmovdqa 1728(%rdi), %xmm3 12219; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12220; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 12221; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12222; AVX2-FP-NEXT: vpbroadcastd 1764(%rdi), %ymm3 12223; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 12224; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 12225; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12226; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm2 12227; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12228; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm1 12229; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 12230; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 12231; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12232; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm3 12233; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12234; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] 12235; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 12236; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 12237; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm2 12238; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm3 12239; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12240; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 12241; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12242; AVX2-FP-NEXT: vpbroadcastd 196(%rdi), %ymm3 12243; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 12244; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 12245; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12246; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm2 12247; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12248; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm1 12249; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12250; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] 12251; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 12252; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm3 12253; AVX2-FP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill 12254; AVX2-FP-NEXT: vpbroadcastq 528(%rdi), %ymm2 12255; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 12256; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 12257; AVX2-FP-NEXT: vmovdqa 576(%rdi), %xmm2 12258; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm3 12259; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12260; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 12261; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12262; AVX2-FP-NEXT: vpbroadcastd 644(%rdi), %ymm3 12263; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 12264; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 12265; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12266; AVX2-FP-NEXT: vmovdqa 928(%rdi), %ymm2 12267; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12268; AVX2-FP-NEXT: vmovdqa 896(%rdi), %ymm1 12269; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill 12270; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] 12271; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 12272; AVX2-FP-NEXT: vmovdqa 992(%rdi), %ymm3 12273; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12274; AVX2-FP-NEXT: vpbroadcastq 976(%rdi), %ymm2 12275; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 12276; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 12277; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %xmm2 12278; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %xmm3 12279; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12280; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 12281; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12282; AVX2-FP-NEXT: vpbroadcastd 1092(%rdi), %ymm3 12283; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 12284; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 12285; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12286; AVX2-FP-NEXT: vmovdqa 1376(%rdi), %ymm14 12287; AVX2-FP-NEXT: vmovdqa 1344(%rdi), %ymm15 12288; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] 12289; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12290; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12291; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm0 12292; AVX2-FP-NEXT: vmovdqa 1440(%rdi), %ymm4 12293; AVX2-FP-NEXT: vpbroadcastq 1424(%rdi), %ymm1 12294; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] 12295; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12296; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 12297; AVX2-FP-NEXT: vmovdqa 1472(%rdi), %xmm1 12298; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %xmm2 12299; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12300; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] 12301; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12302; AVX2-FP-NEXT: vpbroadcastd 1540(%rdi), %ymm2 12303; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 12304; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 12305; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12306; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] 12307; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 12308; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm1 12309; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12310; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm0 12311; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12312; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] 12313; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] 12314; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] 12315; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm12 12316; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 12317; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] 12318; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12319; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] 12320; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12321; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] 12322; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = 
[1,0,7,6,5,6,5,6] 12323; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 12324; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 12325; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12326; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 12327; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12328; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm3 12329; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12330; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm2 12331; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12332; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 12333; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 12334; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 12335; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm2 12336; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12337; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 12338; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] 12339; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] 12340; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12341; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7] 12342; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 12343; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 12344; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12345; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 12346; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12347; AVX2-FP-NEXT: vmovdqa 1280(%rdi), %ymm3 12348; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12349; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm2 12350; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12351; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 12352; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 12353; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 12354; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %ymm2 12355; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12356; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 12357; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] 12358; AVX2-FP-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload 12359; AVX2-FP-NEXT: # ymm7 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7] 12360; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12361; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6],ymm7[7] 12362; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 12363; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 12364; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12365; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 12366; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12367; AVX2-FP-NEXT: vmovdqa 1728(%rdi), %ymm3 12368; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12369; AVX2-FP-NEXT: vmovdqa 1696(%rdi), %ymm2 12370; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12371; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = 
ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 12372; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 12373; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 12374; AVX2-FP-NEXT: vmovdqa 1632(%rdi), %ymm2 12375; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12376; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 12377; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] 12378; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] 12379; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12380; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7] 12381; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 12382; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 12383; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12384; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 12385; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12386; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm3 12387; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12388; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm2 12389; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12390; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 12391; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 12392; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 12393; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm2 12394; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12395; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload 12396; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] 12397; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 12398; AVX2-FP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 12399; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] 12400; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12401; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] 12402; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 12403; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 12404; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12405; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 12406; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12407; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %ymm3 12408; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12409; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %ymm2 12410; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12411; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 12412; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 12413; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 12414; AVX2-FP-NEXT: vmovdqa 960(%rdi), %ymm2 12415; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12416; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 12417; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7] 12418; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 12419; AVX2-FP-NEXT: vpblendd 
$34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 12420; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] 12421; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12422; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] 12423; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 12424; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 12425; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12426; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 12427; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12428; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %ymm3 12429; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12430; AVX2-FP-NEXT: vmovdqa 1472(%rdi), %ymm2 12431; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12432; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 12433; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 12434; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 12435; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %ymm2 12436; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12437; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] 12438; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] 12439; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12440; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] 12441; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 12442; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 12443; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12444; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 12445; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12446; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm15 12447; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm14 12448; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm15[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] 12449; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12450; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12451; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 12452; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 12453; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4 12454; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 12455; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] 12456; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12457; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 12458; AVX2-FP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 12459; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] 12460; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12461; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] 12462; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm0 12463; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 12464; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12465; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm0 12466; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm9[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] 12467; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 12468; AVX2-FP-NEXT: vpbroadcastd 232(%rdi), %xmm1 12469; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm5 12470; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] 12471; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 12472; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 12473; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12474; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] 12475; AVX2-FP-NEXT: vpbroadcastd 428(%rdi), %ymm2 12476; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 12477; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 12478; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12479; AVX2-FP-NEXT: vmovdqa 752(%rdi), %xmm0 12480; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12481; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 12482; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 12483; AVX2-FP-NEXT: vpbroadcastd 680(%rdi), %xmm1 12484; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm2 12485; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12486; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] 12487; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 12488; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 12489; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12490; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] 12491; AVX2-FP-NEXT: vpbroadcastd 876(%rdi), %ymm2 12492; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 12493; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 12494; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12495; AVX2-FP-NEXT: vmovdqa 1200(%rdi), %xmm0 12496; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12497; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 12498; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 12499; AVX2-FP-NEXT: vpbroadcastd 1128(%rdi), %xmm1 12500; AVX2-FP-NEXT: vmovdqa 1152(%rdi), %xmm2 12501; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12502; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] 12503; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 12504; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12505; AVX2-FP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 12506; AVX2-FP-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] 12507; AVX2-FP-NEXT: vpbroadcastd 1324(%rdi), %ymm2 12508; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 12509; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 12510; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12511; AVX2-FP-NEXT: vmovdqa 1648(%rdi), %xmm0 12512; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12513; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 12514; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 12515; AVX2-FP-NEXT: vpbroadcastd 1576(%rdi), %xmm1 12516; AVX2-FP-NEXT: vmovdqa 1600(%rdi), %xmm2 12517; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] 12518; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 12519; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12520; AVX2-FP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 12521; AVX2-FP-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] 12522; AVX2-FP-NEXT: vpbroadcastd 1772(%rdi), %ymm6 12523; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] 12524; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 12525; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12526; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm0 12527; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] 12528; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 12529; AVX2-FP-NEXT: vpbroadcastd 8(%rdi), %xmm1 12530; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4 12531; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12532; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] 12533; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 12534; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] 12535; AVX2-FP-NEXT: vpbroadcastd 204(%rdi), %ymm6 12536; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] 12537; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 12538; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12539; AVX2-FP-NEXT: vmovdqa 528(%rdi), %xmm0 12540; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12541; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 12542; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 12543; AVX2-FP-NEXT: vpbroadcastd 456(%rdi), %xmm1 12544; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm4 12545; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12546; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] 12547; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 12548; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12549; AVX2-FP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 12550; AVX2-FP-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] 12551; AVX2-FP-NEXT: vpbroadcastd 652(%rdi), %ymm15 12552; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] 12553; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 12554; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12555; AVX2-FP-NEXT: vmovdqa 976(%rdi), %xmm0 12556; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12557; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 12558; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 12559; AVX2-FP-NEXT: vpbroadcastd 904(%rdi), %xmm15 12560; AVX2-FP-NEXT: vmovdqa 928(%rdi), 
%xmm11 12561; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3] 12562; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] 12563; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 12564; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 12565; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] 12566; AVX2-FP-NEXT: vpbroadcastd 1100(%rdi), %ymm14 12567; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] 12568; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] 12569; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12570; AVX2-FP-NEXT: vmovdqa 1424(%rdi), %xmm0 12571; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 12572; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 12573; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] 12574; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] 12575; AVX2-FP-NEXT: vpbroadcastd 1352(%rdi), %xmm15 12576; AVX2-FP-NEXT: vmovdqa 1376(%rdi), %xmm0 12577; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] 12578; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] 12579; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 12580; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12581; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] 12582; AVX2-FP-NEXT: vpbroadcastd 1548(%rdi), %ymm13 12583; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] 12584; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] 12585; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12586; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 12587; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload 12588; AVX2-FP-NEXT: # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] 12589; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] 12590; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3] 12591; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] 12592; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] 12593; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 12594; AVX2-FP-NEXT: vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm12[1,3],ymm4[4,6],ymm12[5,7] 12595; AVX2-FP-NEXT: vmovaps %ymm4, %ymm12 12596; AVX2-FP-NEXT: vbroadcastss 432(%rdi), %ymm14 12597; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] 12598; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7] 12599; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12600; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 12601; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload 12602; AVX2-FP-NEXT: # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] 12603; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 12604; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] 12605; AVX2-FP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] 12606; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] 12607; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] 12608; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte 
Reload 12609; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm3[1,3],ymm14[4,6],ymm3[5,7] 12610; AVX2-FP-NEXT: vbroadcastss 880(%rdi), %ymm13 12611; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] 12612; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] 12613; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12614; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 12615; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload 12616; AVX2-FP-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] 12617; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 12618; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] 12619; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] 12620; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] 12621; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] 12622; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 12623; AVX2-FP-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 12624; AVX2-FP-NEXT: # ymm4 = ymm4[0,2],mem[1,3],ymm4[4,6],mem[5,7] 12625; AVX2-FP-NEXT: vbroadcastss 1328(%rdi), %ymm5 12626; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] 12627; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] 12628; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12629; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 12630; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 12631; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] 12632; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] 12633; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] 12634; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] 12635; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] 12636; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 12637; AVX2-FP-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload 12638; AVX2-FP-NEXT: # ymm3 = ymm13[0,2],mem[1,3],ymm13[4,6],mem[5,7] 12639; AVX2-FP-NEXT: vbroadcastss 1776(%rdi), %ymm4 12640; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] 12641; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] 12642; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12643; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7] 12644; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] 12645; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] 12646; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] 12647; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] 12648; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] 12649; AVX2-FP-NEXT: vmovaps %ymm1, %ymm9 12650; AVX2-FP-NEXT: vbroadcastss 1552(%rdi), %ymm3 12651; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 12652; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 12653; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12654; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12655; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12656; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] 12657; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],mem[3] 
12658; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] 12659; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 12660; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 12661; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm8 12662; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7] 12663; AVX2-FP-NEXT: vmovaps %ymm7, %ymm11 12664; AVX2-FP-NEXT: vbroadcastss 1104(%rdi), %ymm2 12665; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 12666; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 12667; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12668; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 12669; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12670; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] 12671; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 12672; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] 12673; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] 12674; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 12675; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 12676; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 12677; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 12678; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm7[1,3],ymm4[4,6],ymm7[5,7] 12679; AVX2-FP-NEXT: vbroadcastss 656(%rdi), %ymm2 12680; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 12681; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 12682; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12683; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12684; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12685; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] 12686; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 12687; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] 12688; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] 12689; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 12690; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 12691; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 12692; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 12693; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm10[1,3],ymm3[4,6],ymm10[5,7] 12694; AVX2-FP-NEXT: vbroadcastss 208(%rdi), %ymm2 12695; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 12696; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 12697; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12698; AVX2-FP-NEXT: vbroadcastss 100(%rdi), %xmm0 12699; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm6 12700; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] 12701; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm5 = [4,3,0,0] 12702; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12703; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 12704; AVX2-FP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] 12705; AVX2-FP-NEXT: vpermps %ymm1, %ymm5, %ymm1 12706; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 12707; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm15 = [0,7,0,7,0,7,0,7] 12708; AVX2-FP-NEXT: vpermps %ymm3, %ymm15, %ymm1 12709; AVX2-FP-NEXT: vblendps {{.*#+}} 
ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] 12710; AVX2-FP-NEXT: vbroadcastss 212(%rdi), %ymm2 12711; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 12712; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 12713; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12714; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12715; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12716; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 12717; AVX2-FP-NEXT: vpermps %ymm0, %ymm5, %ymm0 12718; AVX2-FP-NEXT: vbroadcastss 324(%rdi), %xmm2 12719; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm1 12720; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] 12721; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 12722; AVX2-FP-NEXT: vpermps %ymm12, %ymm15, %ymm2 12723; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 12724; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] 12725; AVX2-FP-NEXT: vbroadcastss 436(%rdi), %ymm3 12726; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 12727; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 12728; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12729; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12730; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12731; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 12732; AVX2-FP-NEXT: vpermps %ymm0, %ymm5, %ymm0 12733; AVX2-FP-NEXT: vbroadcastss 548(%rdi), %xmm3 12734; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm2 12735; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] 12736; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] 12737; AVX2-FP-NEXT: vpermps %ymm4, %ymm15, %ymm3 12738; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] 12739; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm4 12740; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] 12741; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 12742; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 12743; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12744; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12745; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 12746; AVX2-FP-NEXT: vpermps %ymm0, %ymm5, %ymm0 12747; AVX2-FP-NEXT: vbroadcastss 772(%rdi), %xmm4 12748; AVX2-FP-NEXT: vmovaps 736(%rdi), %xmm3 12749; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] 12750; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] 12751; AVX2-FP-NEXT: vpermps %ymm14, %ymm15, %ymm4 12752; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 12753; AVX2-FP-NEXT: # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] 12754; AVX2-FP-NEXT: vbroadcastss 884(%rdi), %ymm7 12755; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] 12756; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] 12757; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12758; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12759; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12760; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 12761; AVX2-FP-NEXT: vpermps %ymm0, %ymm5, %ymm0 12762; AVX2-FP-NEXT: vbroadcastss 996(%rdi), %xmm7 12763; AVX2-FP-NEXT: vmovaps 
960(%rdi), %xmm4 12764; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] 12765; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] 12766; AVX2-FP-NEXT: vpermps %ymm8, %ymm15, %ymm7 12767; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] 12768; AVX2-FP-NEXT: vbroadcastss 1108(%rdi), %ymm8 12769; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] 12770; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] 12771; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12772; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12773; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12774; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 12775; AVX2-FP-NEXT: vpermps %ymm0, %ymm5, %ymm0 12776; AVX2-FP-NEXT: vbroadcastss 1220(%rdi), %xmm7 12777; AVX2-FP-NEXT: vmovaps 1184(%rdi), %xmm14 12778; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] 12779; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] 12780; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 12781; AVX2-FP-NEXT: vpermps %ymm10, %ymm15, %ymm7 12782; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 12783; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7] 12784; AVX2-FP-NEXT: vbroadcastss 1332(%rdi), %ymm8 12785; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] 12786; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] 12787; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12788; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12789; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12790; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 12791; AVX2-FP-NEXT: vpermps %ymm0, %ymm5, %ymm7 12792; AVX2-FP-NEXT: vbroadcastss 1444(%rdi), %xmm8 12793; AVX2-FP-NEXT: vmovaps 1408(%rdi), %xmm0 12794; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3] 12795; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] 12796; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload 12797; AVX2-FP-NEXT: vmovaps %ymm9, %ymm11 12798; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] 12799; AVX2-FP-NEXT: vbroadcastss 1556(%rdi), %ymm9 12800; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] 12801; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 12802; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12803; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 12804; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload 12805; AVX2-FP-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] 12806; AVX2-FP-NEXT: vpermps %ymm7, %ymm5, %ymm7 12807; AVX2-FP-NEXT: vbroadcastss 1668(%rdi), %xmm8 12808; AVX2-FP-NEXT: vmovaps 1632(%rdi), %xmm5 12809; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] 12810; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] 12811; AVX2-FP-NEXT: vpermps %ymm13, %ymm15, %ymm8 12812; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload 12813; AVX2-FP-NEXT: # ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] 12814; AVX2-FP-NEXT: vbroadcastss 1780(%rdi), %ymm9 12815; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] 12816; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 12817; 
AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12818; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 12819; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload 12820; AVX2-FP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] 12821; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] 12822; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] 12823; AVX2-FP-NEXT: vbroadcastss 216(%rdi), %ymm8 12824; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] 12825; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm9 12826; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] 12827; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] 12828; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload 12829; AVX2-FP-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] 12830; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8 12831; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] 12832; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] 12833; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12834; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm13 12835; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] 12836; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] 12837; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 12838; AVX2-FP-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] 12839; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 12840; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] 12841; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 12842; AVX2-FP-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 12843; AVX2-FP-NEXT: # ymm6 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7] 12844; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] 12845; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] 12846; AVX2-FP-NEXT: vbroadcastss 440(%rdi), %ymm7 12847; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] 12848; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] 12849; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12850; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm8 12851; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm2[3] 12852; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] 12853; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 12854; AVX2-FP-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] 12855; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 12856; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 12857; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12858; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 12859; AVX2-FP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] 12860; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] 12861; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] 12862; AVX2-FP-NEXT: vbroadcastss 664(%rdi), %ymm6 12863; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] 12864; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 12865; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12866; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm1 12867; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] 12868; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] 12869; AVX2-FP-NEXT: vpermilps $225, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 12870; AVX2-FP-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] 12871; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3 12872; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] 12873; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 12874; AVX2-FP-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 12875; AVX2-FP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] 12876; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] 12877; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] 12878; AVX2-FP-NEXT: vbroadcastss 888(%rdi), %ymm6 12879; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] 12880; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 12881; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12882; AVX2-FP-NEXT: vmovaps 992(%rdi), %xmm2 12883; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3] 12884; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] 12885; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 12886; AVX2-FP-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] 12887; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 12888; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] 12889; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 12890; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 12891; AVX2-FP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] 12892; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] 12893; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] 12894; AVX2-FP-NEXT: vbroadcastss 1112(%rdi), %ymm6 12895; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] 12896; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7] 12897; AVX2-FP-NEXT: vmovaps 1216(%rdi), %xmm3 12898; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm14[3] 12899; AVX2-FP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] 12900; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 12901; AVX2-FP-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] 12902; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 12903; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] 12904; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7] 12905; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] 12906; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] 12907; AVX2-FP-NEXT: vbroadcastss 1336(%rdi), %ymm10 12908; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] 12909; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] 12910; AVX2-FP-NEXT: vmovaps 1440(%rdi), %xmm4 12911; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] 12912; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] 12913; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload 12914; AVX2-FP-NEXT: # ymm10 = mem[1,0,2,3,5,4,6,7] 12915; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10 12916; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] 12917; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload 12918; AVX2-FP-NEXT: # ymm10 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] 12919; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0,3,3,5,4,7,7] 12920; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] 12921; AVX2-FP-NEXT: vbroadcastss 1560(%rdi), %ymm12 12922; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] 12923; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm10[4,5,6,7] 12924; AVX2-FP-NEXT: vmovaps 1664(%rdi), %xmm14 12925; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm5[3] 12926; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] 12927; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 12928; AVX2-FP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] 12929; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 12930; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] 12931; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12932; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 12933; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] 12934; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] 12935; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] 12936; AVX2-FP-NEXT: vbroadcastss 1784(%rdi), %ymm12 12937; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7] 12938; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm5[4,5,6,7] 12939; AVX2-FP-NEXT: vbroadcastss 136(%rdi), %xmm0 12940; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 12941; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 12942; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 12943; AVX2-FP-NEXT: vpermps 192(%rdi), %ymm15, %ymm5 12944; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] 12945; AVX2-FP-NEXT: vbroadcastss 80(%rdi), %ymm5 12946; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3] 12947; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 12948; AVX2-FP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] 12949; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 12950; AVX2-FP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] 12951; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm11 12952; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] 12953; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm0[4,5,6,7] 12954; AVX2-FP-NEXT: vbroadcastss 360(%rdi), %xmm0 12955; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 12956; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 12957; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 12958; AVX2-FP-NEXT: vpermps 416(%rdi), %ymm15, %ymm5 12959; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] 12960; AVX2-FP-NEXT: vbroadcastss 304(%rdi), %ymm5 12961; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm13[3] 12962; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload 12963; AVX2-FP-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] 12964; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload 12965; AVX2-FP-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] 12966; AVX2-FP-NEXT: vextractf128 $1, %ymm13, %xmm13 12967; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] 12968; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm0[4,5,6,7] 12969; AVX2-FP-NEXT: vbroadcastss 584(%rdi), %xmm0 12970; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 12971; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 12972; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 12973; AVX2-FP-NEXT: vpermps 640(%rdi), %ymm15, %ymm5 12974; AVX2-FP-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] 12975; AVX2-FP-NEXT: vbroadcastss 528(%rdi), %ymm5 12976; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3] 12977; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload 12978; AVX2-FP-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] 12979; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload 12980; AVX2-FP-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] 12981; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8 12982; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] 12983; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm0[4,5,6,7] 12984; AVX2-FP-NEXT: vbroadcastss 808(%rdi), %xmm0 12985; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 12986; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 12987; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 12988; AVX2-FP-NEXT: vpermps 864(%rdi), %ymm15, %ymm5 12989; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] 12990; AVX2-FP-NEXT: vbroadcastss 752(%rdi), %ymm5 12991; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] 12992; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 12993; AVX2-FP-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] 12994; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 12995; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] 12996; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 12997; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] 12998; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12999; AVX2-FP-NEXT: vbroadcastss 1032(%rdi), %xmm1 13000; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 13001; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] 13002; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 13003; AVX2-FP-NEXT: vpermps 1088(%rdi), %ymm15, %ymm5 13004; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] 13005; AVX2-FP-NEXT: vbroadcastss 976(%rdi), %ymm5 13006; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] 13007; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 13008; AVX2-FP-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] 13009; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 13010; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] 13011; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 13012; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] 13013; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 13014; AVX2-FP-NEXT: vbroadcastss 1256(%rdi), %xmm2 13015; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 13016; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 13017; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 13018; AVX2-FP-NEXT: vpermps 1312(%rdi), %ymm15, %ymm5 13019; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] 13020; AVX2-FP-NEXT: vbroadcastss 1200(%rdi), %ymm5 13021; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] 13022; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 13023; AVX2-FP-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] 13024; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 13025; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] 13026; AVX2-FP-NEXT: vextractf128 
$1, %ymm5, %xmm5 13027; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] 13028; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 13029; AVX2-FP-NEXT: vbroadcastss 1480(%rdi), %xmm3 13030; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 13031; AVX2-FP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] 13032; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 13033; AVX2-FP-NEXT: vpermps 1536(%rdi), %ymm15, %ymm5 13034; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] 13035; AVX2-FP-NEXT: vbroadcastss 1424(%rdi), %ymm5 13036; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] 13037; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 13038; AVX2-FP-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] 13039; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 13040; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] 13041; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 13042; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] 13043; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 13044; AVX2-FP-NEXT: vbroadcastss 1704(%rdi), %xmm4 13045; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 13046; AVX2-FP-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 13047; AVX2-FP-NEXT: vpermps 1760(%rdi), %ymm15, %ymm5 13048; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 13049; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 13050; AVX2-FP-NEXT: vbroadcastss 1648(%rdi), %ymm5 13051; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3] 13052; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 13053; AVX2-FP-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] 13054; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload 13055; AVX2-FP-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] 13056; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14 13057; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] 13058; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 13059; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13060; AVX2-FP-NEXT: vmovaps %ymm5, 192(%rsi) 13061; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13062; AVX2-FP-NEXT: vmovaps %ymm5, 128(%rsi) 13063; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13064; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rsi) 13065; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13066; AVX2-FP-NEXT: vmovaps %ymm5, (%rsi) 13067; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13068; AVX2-FP-NEXT: vmovaps %ymm5, 224(%rsi) 13069; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13070; AVX2-FP-NEXT: vmovaps %ymm5, 160(%rsi) 13071; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13072; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rsi) 13073; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13074; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rsi) 13075; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13076; AVX2-FP-NEXT: vmovaps %ymm5, 192(%rdx) 13077; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13078; AVX2-FP-NEXT: vmovaps %ymm5, 128(%rdx) 13079; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13080; AVX2-FP-NEXT: vmovaps %ymm5, 
64(%rdx) 13081; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13082; AVX2-FP-NEXT: vmovaps %ymm5, (%rdx) 13083; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13084; AVX2-FP-NEXT: vmovaps %ymm5, 224(%rdx) 13085; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13086; AVX2-FP-NEXT: vmovaps %ymm5, 160(%rdx) 13087; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13088; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rdx) 13089; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13090; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rdx) 13091; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13092; AVX2-FP-NEXT: vmovaps %ymm5, 192(%rcx) 13093; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13094; AVX2-FP-NEXT: vmovaps %ymm5, 128(%rcx) 13095; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13096; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rcx) 13097; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13098; AVX2-FP-NEXT: vmovaps %ymm5, (%rcx) 13099; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13100; AVX2-FP-NEXT: vmovaps %ymm5, 224(%rcx) 13101; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13102; AVX2-FP-NEXT: vmovaps %ymm5, 160(%rcx) 13103; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13104; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rcx) 13105; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13106; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rcx) 13107; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13108; AVX2-FP-NEXT: vmovaps %ymm5, (%r8) 13109; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13110; AVX2-FP-NEXT: vmovaps %ymm5, 64(%r8) 13111; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13112; AVX2-FP-NEXT: vmovaps %ymm5, 128(%r8) 13113; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13114; AVX2-FP-NEXT: vmovaps %ymm5, 192(%r8) 13115; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13116; AVX2-FP-NEXT: vmovaps %ymm5, 224(%r8) 13117; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13118; AVX2-FP-NEXT: vmovaps %ymm5, 160(%r8) 13119; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13120; AVX2-FP-NEXT: vmovaps %ymm5, 96(%r8) 13121; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13122; AVX2-FP-NEXT: vmovaps %ymm5, 32(%r8) 13123; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13124; AVX2-FP-NEXT: vmovaps %ymm5, 224(%r9) 13125; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13126; AVX2-FP-NEXT: vmovaps %ymm5, 192(%r9) 13127; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13128; AVX2-FP-NEXT: vmovaps %ymm5, 160(%r9) 13129; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13130; AVX2-FP-NEXT: vmovaps %ymm5, 128(%r9) 13131; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13132; AVX2-FP-NEXT: vmovaps %ymm5, 96(%r9) 13133; AVX2-FP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload 13134; AVX2-FP-NEXT: vmovaps %ymm5, 64(%r9) 13135; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13136; AVX2-FP-NEXT: vmovaps %ymm5, 32(%r9) 13137; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13138; AVX2-FP-NEXT: vmovaps %ymm5, (%r9) 13139; 
AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 13140; AVX2-FP-NEXT: vmovaps %ymm12, 224(%rax) 13141; AVX2-FP-NEXT: vmovaps %ymm10, 192(%rax) 13142; AVX2-FP-NEXT: vmovaps %ymm6, 160(%rax) 13143; AVX2-FP-NEXT: vmovaps %ymm7, 128(%rax) 13144; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13145; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rax) 13146; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13147; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rax) 13148; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13149; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rax) 13150; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13151; AVX2-FP-NEXT: vmovaps %ymm5, (%rax) 13152; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 13153; AVX2-FP-NEXT: vmovaps %ymm4, 224(%rax) 13154; AVX2-FP-NEXT: vmovaps %ymm3, 192(%rax) 13155; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rax) 13156; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax) 13157; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax) 13158; AVX2-FP-NEXT: vmovaps %ymm8, 64(%rax) 13159; AVX2-FP-NEXT: vmovaps %ymm13, 32(%rax) 13160; AVX2-FP-NEXT: vmovaps %ymm11, (%rax) 13161; AVX2-FP-NEXT: addq $2648, %rsp # imm = 0xA58 13162; AVX2-FP-NEXT: vzeroupper 13163; AVX2-FP-NEXT: retq 13164; 13165; AVX2-FCP-LABEL: load_i32_stride7_vf64: 13166; AVX2-FCP: # %bb.0: 13167; AVX2-FCP-NEXT: subq $2648, %rsp # imm = 0xA58 13168; AVX2-FCP-NEXT: vmovdqa 1216(%rdi), %ymm9 13169; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13170; AVX2-FCP-NEXT: vmovdqa 1152(%rdi), %ymm4 13171; AVX2-FCP-NEXT: vmovdqa 1120(%rdi), %ymm5 13172; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13173; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm13 13174; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm6 13175; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm7 13176; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm8 13177; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13178; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm10 13179; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm11 13180; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] 13181; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm10[6],ymm11[7] 13182; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13183; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13184; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 13185; AVX2-FCP-NEXT: vpbroadcastq 304(%rdi), %ymm2 13186; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] 13187; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 13188; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm2 13189; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm3 13190; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13191; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 13192; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 13193; AVX2-FCP-NEXT: vpbroadcastd 420(%rdi), %ymm3 13194; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 13195; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 13196; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13197; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] 13198; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13199; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm8 13200; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13201; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 13202; AVX2-FCP-NEXT: vpbroadcastq 752(%rdi), %ymm2 13203; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] 13204; AVX2-FCP-NEXT: vmovdqa %ymm13, %ymm6 13205; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 13206; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %xmm2 13207; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %xmm3 13208; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13209; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 13210; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 13211; AVX2-FCP-NEXT: vpbroadcastd 868(%rdi), %ymm3 13212; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 13213; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 13214; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13215; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] 13216; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13217; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 13218; AVX2-FCP-NEXT: vpbroadcastq 1200(%rdi), %ymm2 13219; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] 13220; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 13221; AVX2-FCP-NEXT: vmovdqa 1248(%rdi), %xmm2 13222; AVX2-FCP-NEXT: vmovdqa 1280(%rdi), %xmm3 13223; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13224; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 13225; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 13226; AVX2-FCP-NEXT: vpbroadcastd 1316(%rdi), %ymm3 13227; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 13228; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 13229; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13230; AVX2-FCP-NEXT: vmovdqa 1600(%rdi), %ymm1 13231; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13232; AVX2-FCP-NEXT: vmovdqa 1568(%rdi), %ymm5 13233; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6],ymm5[7] 13234; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13235; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 13236; AVX2-FCP-NEXT: vmovdqa 1664(%rdi), %ymm3 13237; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13238; AVX2-FCP-NEXT: vpbroadcastq 1648(%rdi), %ymm2 13239; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 13240; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 13241; AVX2-FCP-NEXT: vmovdqa 1696(%rdi), %xmm2 13242; AVX2-FCP-NEXT: vmovdqa 1728(%rdi), %xmm3 13243; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13244; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 13245; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 13246; AVX2-FCP-NEXT: vpbroadcastd 1764(%rdi), %ymm3 13247; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 13248; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 13249; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13250; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 13251; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13252; AVX2-FCP-NEXT: vpbroadcastq 80(%rdi), %ymm1 13253; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 13254; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 13255; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13256; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 13257; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13258; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] 13259; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 13260; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 13261; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm2 13262; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm3 13263; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13264; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 13265; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 13266; AVX2-FCP-NEXT: vpbroadcastd 196(%rdi), %ymm3 13267; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 13268; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 13269; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13270; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm2 13271; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13272; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm1 13273; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13274; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] 13275; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 13276; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm3 13277; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13278; AVX2-FCP-NEXT: vpbroadcastq 528(%rdi), %ymm2 13279; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 13280; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 13281; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %xmm2 13282; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %xmm3 13283; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13284; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 13285; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 13286; AVX2-FCP-NEXT: vpbroadcastd 644(%rdi), %ymm3 13287; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 13288; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 13289; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13290; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %ymm2 13291; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13292; AVX2-FCP-NEXT: vmovdqa 896(%rdi), %ymm1 13293; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13294; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] 13295; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 13296; AVX2-FCP-NEXT: vmovdqa 992(%rdi), %ymm12 13297; AVX2-FCP-NEXT: vpbroadcastq 976(%rdi), %ymm2 13298; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] 13299; AVX2-FCP-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill 13300; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 13301; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %xmm2 13302; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %xmm3 13303; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13304; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 13305; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 13306; AVX2-FCP-NEXT: vpbroadcastd 1092(%rdi), %ymm3 13307; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 13308; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 13309; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13310; AVX2-FCP-NEXT: vmovdqa 1376(%rdi), %ymm15 13311; AVX2-FCP-NEXT: vmovdqa 1344(%rdi), %ymm14 13312; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] 13313; AVX2-FCP-NEXT: vmovdqu %ymm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13314; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13315; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 13316; AVX2-FCP-NEXT: vmovdqa 1440(%rdi), %ymm9 13317; AVX2-FCP-NEXT: vpbroadcastq 1424(%rdi), %ymm1 13318; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] 13319; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13320; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 13321; AVX2-FCP-NEXT: vmovdqa 1472(%rdi), %xmm1 13322; AVX2-FCP-NEXT: vmovdqa 1504(%rdi), %xmm2 13323; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13324; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] 13325; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 13326; AVX2-FCP-NEXT: vpbroadcastd 1540(%rdi), %ymm2 13327; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 13328; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 13329; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13330; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] 13331; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 13332; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm1 13333; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13334; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm0 13335; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13336; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] 13337; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] 13338; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] 13339; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm0 13340; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13341; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 13342; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7] 13343; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] 13344; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13345; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] 13346; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] 13347; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 13348; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 13349; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13350; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 13351; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 13352; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm2 13353; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13354; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm3 13355; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13356; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] 13357; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 13358; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 13359; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm2 13360; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13361; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13362; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] 13363; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] 13364; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13365; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] 13366; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 13367; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 13368; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13369; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 13370; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 13371; AVX2-FCP-NEXT: vmovdqa 1280(%rdi), %ymm3 13372; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13373; AVX2-FCP-NEXT: vmovdqa 1248(%rdi), %ymm2 13374; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13375; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 13376; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 13377; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 13378; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %ymm2 13379; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13380; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 13381; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] 13382; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload 13383; AVX2-FCP-NEXT: # ymm7 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] 13384; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13385; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6],ymm7[7] 13386; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 13387; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 13388; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13389; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 13390; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 13391; AVX2-FCP-NEXT: vmovdqa 1728(%rdi), %ymm3 13392; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13393; AVX2-FCP-NEXT: vmovdqa 1696(%rdi), %ymm2 13394; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13395; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 13396; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 13397; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 13398; AVX2-FCP-NEXT: vmovdqa 1632(%rdi), %ymm2 13399; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13400; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 13401; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] 13402; AVX2-FCP-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 13403; AVX2-FCP-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] 13404; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13405; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7] 13406; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 13407; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 13408; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13409; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 13410; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 13411; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm3 13412; 
AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13413; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm2 13414; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13415; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 13416; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 13417; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 13418; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm2 13419; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13420; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 13421; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] 13422; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 13423; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 13424; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] 13425; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13426; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] 13427; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 13428; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 13429; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13430; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 13431; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 13432; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %ymm3 13433; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13434; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %ymm2 13435; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13436; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 13437; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 13438; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 13439; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %ymm2 13440; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13441; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7] 13442; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 13443; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 13444; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] 13445; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13446; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] 13447; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 13448; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 13449; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13450; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 13451; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 13452; AVX2-FCP-NEXT: vmovdqa 1504(%rdi), %ymm3 13453; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13454; AVX2-FCP-NEXT: vmovdqa 1472(%rdi), %ymm2 13455; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13456; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] 13457; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 13458; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 13459; 
AVX2-FCP-NEXT: vmovdqa 1408(%rdi), %ymm2 13460; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13461; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] 13462; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] 13463; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13464; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] 13465; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 13466; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 13467; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13468; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] 13469; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 13470; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm15 13471; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 13472; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm15[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] 13473; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13474; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13475; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] 13476; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] 13477; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 13478; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 13479; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] 13480; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13481; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 13482; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 13483; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] 13484; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13485; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] 13486; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 13487; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 13488; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13489; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm0 13490; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13491; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 13492; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 13493; AVX2-FCP-NEXT: vpbroadcastd 232(%rdi), %xmm1 13494; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm5 13495; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] 13496; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 13497; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 13498; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 13499; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] 13500; AVX2-FCP-NEXT: vpbroadcastd 428(%rdi), %ymm2 13501; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 13502; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 13503; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13504; AVX2-FCP-NEXT: vmovdqa 752(%rdi), %xmm0 13505; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13506; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 13507; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 13508; AVX2-FCP-NEXT: vpbroadcastd 680(%rdi), %xmm1 13509; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm2 13510; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13511; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] 13512; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 13513; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 13514; AVX2-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload 13515; AVX2-FCP-NEXT: # ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] 13516; AVX2-FCP-NEXT: vpbroadcastd 876(%rdi), %ymm2 13517; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 13518; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 13519; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13520; AVX2-FCP-NEXT: vmovdqa 1200(%rdi), %xmm0 13521; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13522; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 13523; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 13524; AVX2-FCP-NEXT: vpbroadcastd 1128(%rdi), %xmm1 13525; AVX2-FCP-NEXT: vmovdqa 1152(%rdi), %xmm2 13526; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13527; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] 13528; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 13529; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 13530; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13531; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] 13532; AVX2-FCP-NEXT: vpbroadcastd 1324(%rdi), %ymm2 13533; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 13534; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 13535; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13536; AVX2-FCP-NEXT: vmovdqa 1648(%rdi), %xmm0 13537; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13538; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 13539; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 13540; AVX2-FCP-NEXT: vpbroadcastd 1576(%rdi), %xmm1 13541; AVX2-FCP-NEXT: vmovdqa 1600(%rdi), %xmm2 13542; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] 13543; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 13544; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13545; AVX2-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 13546; AVX2-FCP-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] 13547; AVX2-FCP-NEXT: vpbroadcastd 1772(%rdi), %ymm6 13548; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] 13549; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 13550; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13551; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 13552; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm12[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] 13553; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 13554; AVX2-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm1 13555; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 13556; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13557; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] 13558; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 13559; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] 13560; AVX2-FCP-NEXT: vpbroadcastd 204(%rdi), %ymm6 13561; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] 13562; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 13563; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13564; AVX2-FCP-NEXT: vmovdqa 528(%rdi), %xmm0 13565; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13566; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 13567; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 13568; AVX2-FCP-NEXT: vpbroadcastd 456(%rdi), %xmm1 13569; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm4 13570; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13571; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] 13572; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 13573; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13574; AVX2-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 13575; AVX2-FCP-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] 13576; AVX2-FCP-NEXT: vpbroadcastd 652(%rdi), %ymm15 13577; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] 13578; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 13579; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13580; AVX2-FCP-NEXT: vmovdqa 976(%rdi), %xmm0 13581; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13582; AVX2-FCP-NEXT: vpalignr $8, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload 13583; AVX2-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 13584; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] 13585; AVX2-FCP-NEXT: vpbroadcastd 904(%rdi), %xmm15 13586; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %xmm12 13587; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] 13588; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] 13589; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13590; AVX2-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload 13591; AVX2-FCP-NEXT: # ymm15 = ymm1[0],mem[0],ymm1[2],mem[2] 13592; AVX2-FCP-NEXT: vpbroadcastd 1100(%rdi), %ymm14 13593; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] 13594; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] 13595; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13596; AVX2-FCP-NEXT: vmovdqa 1424(%rdi), %xmm0 13597; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 13598; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 13599; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm14 = 
ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] 13600; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] 13601; AVX2-FCP-NEXT: vpbroadcastd 1352(%rdi), %xmm15 13602; AVX2-FCP-NEXT: vmovdqa 1376(%rdi), %xmm0 13603; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] 13604; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] 13605; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 13606; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13607; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] 13608; AVX2-FCP-NEXT: vpbroadcastd 1548(%rdi), %ymm13 13609; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] 13610; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] 13611; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13612; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 13613; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload 13614; AVX2-FCP-NEXT: # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] 13615; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] 13616; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3] 13617; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] 13618; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] 13619; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] 13620; AVX2-FCP-NEXT: vbroadcastss 432(%rdi), %ymm14 13621; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] 13622; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7] 13623; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13624; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 13625; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload 13626; AVX2-FCP-NEXT: # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] 13627; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 13628; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] 13629; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] 13630; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] 13631; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] 13632; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 13633; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,2],ymm10[1,3],ymm3[4,6],ymm10[5,7] 13634; AVX2-FCP-NEXT: vbroadcastss 880(%rdi), %ymm13 13635; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] 13636; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] 13637; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13638; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 13639; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload 13640; AVX2-FCP-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] 13641; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 13642; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] 13643; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] 13644; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] 13645; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] 13646; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 13647; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm8[1,3],ymm14[4,6],ymm8[5,7] 13648; AVX2-FCP-NEXT: vmovaps %ymm8, %ymm13 13649; AVX2-FCP-NEXT: vbroadcastss 1328(%rdi), %ymm5 13650; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] 13651; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] 13652; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13653; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 13654; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 13655; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] 13656; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] 13657; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] 13658; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] 13659; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] 13660; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 13661; AVX2-FCP-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 13662; AVX2-FCP-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] 13663; AVX2-FCP-NEXT: vbroadcastss 1776(%rdi), %ymm4 13664; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] 13665; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] 13666; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13667; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2,3,4,5,6,7] 13668; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] 13669; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] 13670; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] 13671; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] 13672; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,2],ymm1[1,3],ymm11[4,6],ymm1[5,7] 13673; AVX2-FCP-NEXT: vbroadcastss 1552(%rdi), %ymm3 13674; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 13675; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 13676; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13677; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload 13678; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 13679; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] 13680; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] 13681; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] 13682; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 13683; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 13684; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 13685; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 13686; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm8[1,3],ymm11[4,6],ymm8[5,7] 13687; AVX2-FCP-NEXT: vbroadcastss 1104(%rdi), %ymm2 13688; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 13689; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 13690; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13691; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 13692; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 13693; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] 13694; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload 13695; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] 13696; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] 13697; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 13698; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 13699; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 13700; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 13701; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] 13702; AVX2-FCP-NEXT: vbroadcastss 656(%rdi), %ymm2 13703; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 13704; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 13705; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13706; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 13707; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 13708; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] 13709; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 13710; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] 13711; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] 13712; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] 13713; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 13714; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 13715; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 13716; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm12[1,3],ymm4[4,6],ymm12[5,7] 13717; AVX2-FCP-NEXT: vbroadcastss 208(%rdi), %ymm2 13718; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 13719; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 13720; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13721; AVX2-FCP-NEXT: vbroadcastss 100(%rdi), %xmm0 13722; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm3 13723; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm0[3] 13724; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm0 = [4,3,0,0] 13725; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 13726; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 13727; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] 13728; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm2 13729; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 13730; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm15 = [0,7,0,7,0,7,0,7] 13731; AVX2-FCP-NEXT: vpermps %ymm4, %ymm15, %ymm2 13732; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] 13733; AVX2-FCP-NEXT: vbroadcastss 212(%rdi), %ymm4 13734; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] 13735; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 13736; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13737; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 13738; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 13739; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] 13740; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm2 13741; AVX2-FCP-NEXT: vbroadcastss 324(%rdi), %xmm4 13742; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm1 13743; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] 13744; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] 13745; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 
# 32-byte Folded Reload 13746; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] 13747; AVX2-FCP-NEXT: vbroadcastss 436(%rdi), %ymm5 13748; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] 13749; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 13750; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13751; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 13752; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 13753; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] 13754; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm4 13755; AVX2-FCP-NEXT: vbroadcastss 548(%rdi), %xmm5 13756; AVX2-FCP-NEXT: vmovaps 512(%rdi), %xmm2 13757; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] 13758; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] 13759; AVX2-FCP-NEXT: vpermps %ymm6, %ymm15, %ymm5 13760; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] 13761; AVX2-FCP-NEXT: vbroadcastss 660(%rdi), %ymm6 13762; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] 13763; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 13764; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13765; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 13766; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 13767; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] 13768; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm5 13769; AVX2-FCP-NEXT: vbroadcastss 772(%rdi), %xmm6 13770; AVX2-FCP-NEXT: vmovaps 736(%rdi), %xmm4 13771; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3] 13772; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] 13773; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload 13774; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] 13775; AVX2-FCP-NEXT: vbroadcastss 884(%rdi), %ymm7 13776; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] 13777; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] 13778; AVX2-FCP-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill 13779; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 13780; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 13781; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] 13782; AVX2-FCP-NEXT: vpermps %ymm5, %ymm0, %ymm6 13783; AVX2-FCP-NEXT: vbroadcastss 996(%rdi), %xmm7 13784; AVX2-FCP-NEXT: vmovaps 960(%rdi), %xmm5 13785; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3] 13786; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] 13787; AVX2-FCP-NEXT: vpermps %ymm11, %ymm15, %ymm7 13788; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] 13789; AVX2-FCP-NEXT: vbroadcastss 1108(%rdi), %ymm8 13790; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] 13791; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] 13792; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13793; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 13794; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 13795; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] 13796; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm6 13797; AVX2-FCP-NEXT: vbroadcastss 1220(%rdi), %xmm7 13798; AVX2-FCP-NEXT: vmovaps 
1184(%rdi), %xmm10 13799; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] 13800; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] 13801; AVX2-FCP-NEXT: vpermps %ymm14, %ymm15, %ymm7 13802; AVX2-FCP-NEXT: vmovaps %ymm13, %ymm11 13803; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] 13804; AVX2-FCP-NEXT: vbroadcastss 1332(%rdi), %ymm8 13805; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] 13806; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] 13807; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13808; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 13809; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 13810; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] 13811; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm6 13812; AVX2-FCP-NEXT: vbroadcastss 1444(%rdi), %xmm7 13813; AVX2-FCP-NEXT: vmovaps 1408(%rdi), %xmm14 13814; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] 13815; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] 13816; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 13817; AVX2-FCP-NEXT: vpermps %ymm13, %ymm15, %ymm7 13818; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 13819; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] 13820; AVX2-FCP-NEXT: vbroadcastss 1556(%rdi), %ymm8 13821; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] 13822; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] 13823; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13824; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 13825; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 13826; AVX2-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] 13827; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm6 13828; AVX2-FCP-NEXT: vbroadcastss 1668(%rdi), %xmm7 13829; AVX2-FCP-NEXT: vmovaps 1632(%rdi), %xmm0 13830; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] 13831; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] 13832; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload 13833; AVX2-FCP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload 13834; AVX2-FCP-NEXT: # ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] 13835; AVX2-FCP-NEXT: vbroadcastss 1780(%rdi), %ymm8 13836; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] 13837; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] 13838; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13839; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload 13840; AVX2-FCP-NEXT: # ymm6 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] 13841; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm12 = [1,0,3,3,1,0,7,7] 13842; AVX2-FCP-NEXT: vpermps %ymm6, %ymm12, %ymm6 13843; AVX2-FCP-NEXT: vbroadcastss 216(%rdi), %ymm7 13844; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] 13845; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm7 13846; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] 13847; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] 13848; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload 13849; AVX2-FCP-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] 13850; AVX2-FCP-NEXT: vextractf128 $1, %ymm8, %xmm8 13851; 
AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] 13852; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] 13853; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13854; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm8 13855; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] 13856; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] 13857; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 13858; AVX2-FCP-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] 13859; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3 13860; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] 13861; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 13862; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 13863; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] 13864; AVX2-FCP-NEXT: vpermps %ymm3, %ymm12, %ymm3 13865; AVX2-FCP-NEXT: vbroadcastss 440(%rdi), %ymm6 13866; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] 13867; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 13868; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13869; AVX2-FCP-NEXT: vmovaps 544(%rdi), %xmm6 13870; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm2[3] 13871; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] 13872; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 13873; AVX2-FCP-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] 13874; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 13875; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 13876; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 13877; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 13878; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] 13879; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 13880; AVX2-FCP-NEXT: vbroadcastss 664(%rdi), %ymm3 13881; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 13882; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 13883; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13884; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm3 13885; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm4[3] 13886; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] 13887; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 13888; AVX2-FCP-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] 13889; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 13890; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 13891; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 13892; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 13893; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] 13894; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 13895; AVX2-FCP-NEXT: vbroadcastss 888(%rdi), %ymm4 13896; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] 13897; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 13898; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13899; AVX2-FCP-NEXT: vmovaps 992(%rdi), %xmm1 13900; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm5[3] 13901; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] 13902; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 
13903; AVX2-FCP-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] 13904; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4 13905; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] 13906; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 13907; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 13908; AVX2-FCP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] 13909; AVX2-FCP-NEXT: vpermps %ymm4, %ymm12, %ymm4 13910; AVX2-FCP-NEXT: vbroadcastss 1112(%rdi), %ymm5 13911; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] 13912; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 13913; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13914; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %xmm2 13915; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm10[3] 13916; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] 13917; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 13918; AVX2-FCP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] 13919; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 13920; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] 13921; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload 13922; AVX2-FCP-NEXT: # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] 13923; AVX2-FCP-NEXT: vpermps %ymm5, %ymm12, %ymm5 13924; AVX2-FCP-NEXT: vbroadcastss 1336(%rdi), %ymm10 13925; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] 13926; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] 13927; AVX2-FCP-NEXT: vmovaps 1440(%rdi), %xmm4 13928; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm4[0,1,2],xmm14[3] 13929; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2] 13930; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 13931; AVX2-FCP-NEXT: # ymm14 = mem[1,0,2,3,5,4,6,7] 13932; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14 13933; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] 13934; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7] 13935; AVX2-FCP-NEXT: vpermps %ymm14, %ymm12, %ymm14 13936; AVX2-FCP-NEXT: vbroadcastss 1560(%rdi), %ymm11 13937; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7] 13938; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 13939; AVX2-FCP-NEXT: vmovaps 1664(%rdi), %xmm14 13940; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] 13941; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] 13942; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 13943; AVX2-FCP-NEXT: # ymm11 = mem[1,0,2,3,5,4,6,7] 13944; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 13945; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3] 13946; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 13947; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload 13948; AVX2-FCP-NEXT: # ymm11 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] 13949; AVX2-FCP-NEXT: vpermps %ymm11, %ymm12, %ymm11 13950; AVX2-FCP-NEXT: vbroadcastss 1784(%rdi), %ymm12 13951; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] 13952; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm11[4,5,6,7] 13953; AVX2-FCP-NEXT: vbroadcastss 136(%rdi), %xmm0 13954; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 
13955; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 13956; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 13957; AVX2-FCP-NEXT: vpermps 192(%rdi), %ymm15, %ymm11 13958; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] 13959; AVX2-FCP-NEXT: vbroadcastss 80(%rdi), %ymm11 13960; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3] 13961; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 13962; AVX2-FCP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] 13963; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 13964; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] 13965; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 13966; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3] 13967; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] 13968; AVX2-FCP-NEXT: vbroadcastss 360(%rdi), %xmm0 13969; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 13970; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 13971; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 13972; AVX2-FCP-NEXT: vpermps 416(%rdi), %ymm15, %ymm11 13973; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] 13974; AVX2-FCP-NEXT: vbroadcastss 304(%rdi), %ymm11 13975; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm8[3] 13976; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload 13977; AVX2-FCP-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] 13978; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload 13979; AVX2-FCP-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] 13980; AVX2-FCP-NEXT: vextractf128 $1, %ymm13, %xmm13 13981; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] 13982; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm0[4,5,6,7] 13983; AVX2-FCP-NEXT: vbroadcastss 584(%rdi), %xmm0 13984; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 13985; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 13986; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 13987; AVX2-FCP-NEXT: vpermps 640(%rdi), %ymm15, %ymm11 13988; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] 13989; AVX2-FCP-NEXT: vbroadcastss 528(%rdi), %ymm11 13990; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1,2],xmm6[3] 13991; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 13992; AVX2-FCP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] 13993; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 13994; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] 13995; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 13996; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] 13997; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] 13998; AVX2-FCP-NEXT: vbroadcastss 808(%rdi), %xmm0 13999; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 14000; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 14001; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 14002; AVX2-FCP-NEXT: vpermps 864(%rdi), %ymm15, %ymm11 14003; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] 14004; AVX2-FCP-NEXT: vbroadcastss 752(%rdi), %ymm11 14005; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] 14006; AVX2-FCP-NEXT: vpermilps $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 14007; AVX2-FCP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] 14008; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 14009; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] 14010; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 14011; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] 14012; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] 14013; AVX2-FCP-NEXT: vbroadcastss 1032(%rdi), %xmm0 14014; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 14015; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] 14016; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 14017; AVX2-FCP-NEXT: vpermps 1088(%rdi), %ymm15, %ymm11 14018; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] 14019; AVX2-FCP-NEXT: vbroadcastss 976(%rdi), %ymm11 14020; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] 14021; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 14022; AVX2-FCP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] 14023; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 14024; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] 14025; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 14026; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] 14027; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 14028; AVX2-FCP-NEXT: vbroadcastss 1256(%rdi), %xmm1 14029; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 14030; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] 14031; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 14032; AVX2-FCP-NEXT: vpermps 1312(%rdi), %ymm15, %ymm11 14033; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] 14034; AVX2-FCP-NEXT: vbroadcastss 1200(%rdi), %ymm11 14035; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] 14036; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 14037; AVX2-FCP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] 14038; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 14039; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] 14040; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 14041; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] 14042; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 14043; AVX2-FCP-NEXT: vbroadcastss 1480(%rdi), %xmm2 14044; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 14045; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] 14046; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 14047; AVX2-FCP-NEXT: vpermps 1536(%rdi), %ymm15, %ymm11 14048; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] 14049; AVX2-FCP-NEXT: vbroadcastss 1424(%rdi), %ymm11 14050; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] 14051; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 14052; AVX2-FCP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] 14053; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 14054; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] 14055; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 14056; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] 14057; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] 14058; AVX2-FCP-NEXT: vbroadcastss 1704(%rdi), %xmm4 14059; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 14060; AVX2-FCP-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] 14061; AVX2-FCP-NEXT: vpermps 1760(%rdi), %ymm15, %ymm11 14062; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 14063; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] 14064; AVX2-FCP-NEXT: vbroadcastss 1648(%rdi), %ymm11 14065; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm14[3] 14066; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 14067; AVX2-FCP-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] 14068; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload 14069; AVX2-FCP-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] 14070; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14 14071; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] 14072; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] 14073; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 14074; AVX2-FCP-NEXT: vmovaps %ymm11, 192(%rsi) 14075; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 14076; AVX2-FCP-NEXT: vmovaps %ymm11, 128(%rsi) 14077; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 14078; AVX2-FCP-NEXT: vmovaps %ymm11, 64(%rsi) 14079; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 14080; AVX2-FCP-NEXT: vmovaps %ymm11, (%rsi) 14081; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 14082; AVX2-FCP-NEXT: vmovaps %ymm11, 224(%rsi) 14083; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 14084; AVX2-FCP-NEXT: vmovaps %ymm11, 160(%rsi) 14085; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 14086; AVX2-FCP-NEXT: vmovaps %ymm11, 96(%rsi) 14087; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 14088; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rsi) 14089; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14090; AVX2-FCP-NEXT: vmovaps %ymm9, 192(%rdx) 14091; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14092; AVX2-FCP-NEXT: vmovaps %ymm9, 128(%rdx) 14093; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14094; AVX2-FCP-NEXT: vmovaps %ymm9, 64(%rdx) 14095; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14096; AVX2-FCP-NEXT: vmovaps %ymm9, (%rdx) 14097; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14098; AVX2-FCP-NEXT: vmovaps %ymm9, 224(%rdx) 14099; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 14100; AVX2-FCP-NEXT: vmovaps %ymm11, 160(%rdx) 14101; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 14102; AVX2-FCP-NEXT: vmovaps %ymm11, 96(%rdx) 14103; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 14104; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rdx) 14105; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14106; AVX2-FCP-NEXT: vmovaps %ymm9, 192(%rcx) 14107; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14108; AVX2-FCP-NEXT: vmovaps %ymm9, 128(%rcx) 14109; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14110; AVX2-FCP-NEXT: vmovaps %ymm9, 64(%rcx) 14111; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14112; AVX2-FCP-NEXT: vmovaps %ymm9, (%rcx) 14113; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14114; AVX2-FCP-NEXT: vmovaps %ymm9, 224(%rcx) 14115; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14116; AVX2-FCP-NEXT: vmovaps %ymm9, 160(%rcx) 14117; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14118; AVX2-FCP-NEXT: vmovaps %ymm9, 96(%rcx) 14119; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14120; AVX2-FCP-NEXT: vmovaps %ymm9, 32(%rcx) 14121; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14122; AVX2-FCP-NEXT: vmovaps %ymm9, (%r8) 14123; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14124; AVX2-FCP-NEXT: vmovaps %ymm9, 64(%r8) 14125; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14126; AVX2-FCP-NEXT: vmovaps %ymm9, 128(%r8) 14127; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14128; AVX2-FCP-NEXT: vmovaps %ymm9, 192(%r8) 14129; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14130; AVX2-FCP-NEXT: vmovaps %ymm9, 224(%r8) 14131; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14132; AVX2-FCP-NEXT: vmovaps %ymm9, 160(%r8) 14133; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14134; AVX2-FCP-NEXT: vmovaps %ymm9, 96(%r8) 14135; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14136; AVX2-FCP-NEXT: vmovaps %ymm9, 32(%r8) 14137; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14138; AVX2-FCP-NEXT: vmovaps %ymm9, 224(%r9) 14139; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14140; AVX2-FCP-NEXT: vmovaps %ymm9, 192(%r9) 14141; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14142; AVX2-FCP-NEXT: vmovaps %ymm9, 160(%r9) 14143; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14144; AVX2-FCP-NEXT: vmovaps %ymm9, 128(%r9) 14145; AVX2-FCP-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload 14146; AVX2-FCP-NEXT: vmovaps %ymm9, 96(%r9) 14147; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14148; AVX2-FCP-NEXT: vmovaps %ymm9, 64(%r9) 14149; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14150; AVX2-FCP-NEXT: vmovaps %ymm9, 32(%r9) 14151; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 14152; AVX2-FCP-NEXT: vmovaps %ymm9, (%r9) 14153; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 14154; AVX2-FCP-NEXT: vmovaps %ymm12, 224(%rax) 14155; AVX2-FCP-NEXT: vmovaps %ymm10, 192(%rax) 14156; AVX2-FCP-NEXT: vmovaps %ymm5, 160(%rax) 14157; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 14158; AVX2-FCP-NEXT: vmovaps %ymm5, 128(%rax) 14159; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 14160; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rax) 14161; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 14162; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rax) 14163; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 14164; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rax) 14165; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 14166; AVX2-FCP-NEXT: vmovaps %ymm5, (%rax) 14167; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 14168; AVX2-FCP-NEXT: vmovaps %ymm4, 224(%rax) 14169; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rax) 14170; AVX2-FCP-NEXT: 
vmovaps %ymm1, 160(%rax) 14171; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax) 14172; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rax) 14173; AVX2-FCP-NEXT: vmovaps %ymm8, 64(%rax) 14174; AVX2-FCP-NEXT: vmovaps %ymm13, 32(%rax) 14175; AVX2-FCP-NEXT: vmovaps %ymm7, (%rax) 14176; AVX2-FCP-NEXT: addq $2648, %rsp # imm = 0xA58 14177; AVX2-FCP-NEXT: vzeroupper 14178; AVX2-FCP-NEXT: retq 14179; 14180; AVX512-LABEL: load_i32_stride7_vf64: 14181; AVX512: # %bb.0: 14182; AVX512-NEXT: subq $3400, %rsp # imm = 0xD48 14183; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm2 14184; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm17 14185; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm11 14186; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm7 14187; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm5 14188; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm12 14189; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm6 14190; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm8 14191; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm13 14192; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm20 14193; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm4 14194; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm14 14195; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 14196; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 14197; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 14198; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 14199; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 14200; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 14201; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 14202; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14203; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 14204; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 14205; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 14206; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14207; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 14208; AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 14209; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 14210; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14211; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 14212; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 14213; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14214; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 14215; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 14216; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 14217; AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 14218; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 14219; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 14220; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 14221; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14222; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 14223; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 14224; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 14225; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14226; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 14227; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 14228; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 14229; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14230; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 14231; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 14232; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14233; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 14234; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 14235; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 14236; 
AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 14237; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 14238; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 14239; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 14240; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14241; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 14242; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 14243; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 14244; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14245; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 14246; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 14247; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 14248; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14249; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 14250; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 14251; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14252; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 14253; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 14254; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 14255; AVX512-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 14256; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 14257; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 14258; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 14259; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14260; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 14261; AVX512-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 14262; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 14263; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14264; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 14265; AVX512-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 14266; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 14267; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14268; AVX512-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 14269; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 14270; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14271; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 14272; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 14273; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 14274; AVX512-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 14275; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 14276; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 14277; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 14278; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14279; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 14280; AVX512-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 14281; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 14282; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14283; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 14284; AVX512-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 14285; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 14286; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14287; AVX512-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 14288; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 14289; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14290; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm3 14291; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm15 14292; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 14293; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 14294; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 14295; AVX512-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 
14296; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14297; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm9 14298; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm16 14299; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 14300; AVX512-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 14301; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14302; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm0 14303; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm18 14304; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 14305; AVX512-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 14306; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14307; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm1 14308; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm19 14309; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 14310; AVX512-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 14311; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14312; AVX512-NEXT: vmovdqa64 %zmm12, %zmm21 14313; AVX512-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 14314; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 14315; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 14316; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 14317; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14318; AVX512-NEXT: vmovdqa64 %zmm13, %zmm21 14319; AVX512-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 14320; AVX512-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 14321; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14322; AVX512-NEXT: vmovdqa64 %zmm14, %zmm21 14323; AVX512-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 14324; AVX512-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 14325; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14326; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 14327; AVX512-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 14328; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 14329; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 14330; AVX512-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 14331; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 14332; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 14333; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 14334; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14335; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 14336; AVX512-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 14337; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14338; AVX512-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 14339; AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 14340; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14341; AVX512-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 14342; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 14343; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14344; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 14345; AVX512-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 14346; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14347; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 14348; AVX512-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 14349; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14350; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 14351; AVX512-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 14352; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14353; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 14354; AVX512-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 14355; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14356; 
AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 14357; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 14358; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2 14359; AVX512-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 14360; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14361; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 14362; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 14363; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 14364; AVX512-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 14365; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14366; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 14367; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 14368; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 14369; AVX512-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 14370; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14371; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 14372; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 14373; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 14374; AVX512-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 14375; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14376; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 14377; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 14378; AVX512-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 14379; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14380; AVX512-NEXT: vmovdqa64 %zmm18, %zmm4 14381; AVX512-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 14382; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14383; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 14384; AVX512-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 14385; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14386; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 14387; AVX512-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 14388; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14389; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 14390; AVX512-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 14391; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14392; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 14393; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14394; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 14395; AVX512-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 14396; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 14397; AVX512-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 14398; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 14399; AVX512-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 14400; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14401; AVX512-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 14402; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 14403; AVX512-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 14404; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14405; AVX512-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 14406; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 14407; AVX512-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 14408; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14409; AVX512-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 14410; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 14411; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14412; AVX512-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 14413; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14414; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm0 14415; 
AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 14416; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] 14417; AVX512-NEXT: vmovdqa64 %zmm17, %zmm22 14418; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 14419; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] 14420; AVX512-NEXT: vmovdqa64 %zmm17, %zmm23 14421; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 14422; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] 14423; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 14424; AVX512-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 14425; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] 14426; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 14427; AVX512-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 14428; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] 14429; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 14430; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 14431; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14432; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] 14433; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 14434; AVX512-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 14435; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14436; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] 14437; AVX512-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 14438; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 14439; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 14440; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 14441; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 14442; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 14443; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 14444; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 14445; AVX512-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 14446; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 14447; AVX512-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 14448; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 14449; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 14450; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14451; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 14452; AVX512-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 14453; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14454; AVX512-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 14455; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm9 14456; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm6 14457; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 14458; AVX512-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 14459; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0 14460; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm1 14461; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 14462; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 14463; AVX512-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 14464; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 14465; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 14466; AVX512-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 14467; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 14468; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 14469; AVX512-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 14470; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 14471; AVX512-NEXT: vmovdqa64 %zmm6, %zmm21 14472; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 14473; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 14474; AVX512-NEXT: vmovdqa64 %zmm6, %zmm26 14475; AVX512-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 14476; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 14477; AVX512-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 14478; AVX512-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 14479; AVX512-NEXT: movw $992, %ax # imm = 0x3E0 14480; AVX512-NEXT: kmovw %eax, %k1 14481; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14482; AVX512-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} 14483; AVX512-NEXT: movb $-32, %al 14484; AVX512-NEXT: kmovw %eax, 
%k2 14485; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14486; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} 14487; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14488; AVX512-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} 14489; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14490; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} 14491; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload 14492; AVX512-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} 14493; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14494; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} 14495; AVX512-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} 14496; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14497; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} 14498; AVX512-NEXT: movw $480, %ax # imm = 0x1E0 14499; AVX512-NEXT: kmovw %eax, %k2 14500; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14501; AVX512-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} 14502; AVX512-NEXT: movw $-512, %ax # imm = 0xFE00 14503; AVX512-NEXT: kmovw %eax, %k1 14504; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14505; AVX512-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} 14506; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14507; AVX512-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} 14508; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14509; AVX512-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} 14510; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14511; AVX512-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} 14512; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14513; AVX512-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} 14514; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14515; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} 14516; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14517; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 14518; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14519; AVX512-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} 14520; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14521; AVX512-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} 14522; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14523; AVX512-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} 14524; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14525; AVX512-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} 14526; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14527; AVX512-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} 14528; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14529; AVX512-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} 14530; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14531; AVX512-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} 14532; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14533; AVX512-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} 14534; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14535; AVX512-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} 14536; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14537; AVX512-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} 14538; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14539; AVX512-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} 14540; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14541; AVX512-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} 
14542; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14543; AVX512-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} 14544; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14545; AVX512-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} 14546; AVX512-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} 14547; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14548; AVX512-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 14549; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14550; AVX512-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 14551; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 14552; AVX512-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} 14553; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 14554; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload 14555; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 14556; AVX512-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} 14557; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 14558; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload 14559; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 14560; AVX512-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} 14561; AVX512-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 14562; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 14563; AVX512-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} 14564; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 14565; AVX512-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 14566; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 14567; AVX512-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 14568; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 14569; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload 14570; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 14571; AVX512-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} 14572; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 14573; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload 14574; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 14575; AVX512-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} 14576; AVX512-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 14577; AVX512-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} 14578; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 14579; AVX512-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 14580; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 14581; AVX512-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} 14582; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 14583; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 14584; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 14585; AVX512-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} 14586; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 14587; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 14588; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 14589; AVX512-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} 14590; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 14591; AVX512-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 14592; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 14593; 
AVX512-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} 14594; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rsi) 14595; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rsi) 14596; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rsi) 14597; AVX512-NEXT: vmovdqa64 %zmm13, (%rsi) 14598; AVX512-NEXT: vmovdqa64 %zmm3, 192(%rdx) 14599; AVX512-NEXT: vmovdqa64 %zmm14, (%rdx) 14600; AVX512-NEXT: vmovdqa64 %zmm23, 64(%rdx) 14601; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rdx) 14602; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rcx) 14603; AVX512-NEXT: vmovdqa64 %zmm15, (%rcx) 14604; AVX512-NEXT: vmovdqa64 %zmm24, 64(%rcx) 14605; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rcx) 14606; AVX512-NEXT: vmovdqa64 %zmm7, 192(%r8) 14607; AVX512-NEXT: vmovdqa64 %zmm16, (%r8) 14608; AVX512-NEXT: vmovdqa64 %zmm29, 64(%r8) 14609; AVX512-NEXT: vmovdqa64 %zmm12, 128(%r8) 14610; AVX512-NEXT: vmovdqa64 %zmm18, 192(%r9) 14611; AVX512-NEXT: vmovdqa64 %zmm20, (%r9) 14612; AVX512-NEXT: vmovdqa64 %zmm9, 64(%r9) 14613; AVX512-NEXT: vmovdqa64 %zmm0, 128(%r9) 14614; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 14615; AVX512-NEXT: vmovdqa64 %zmm19, 192(%rax) 14616; AVX512-NEXT: vmovdqa64 %zmm27, (%rax) 14617; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rax) 14618; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rax) 14619; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 14620; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax) 14621; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) 14622; AVX512-NEXT: vmovdqa64 %zmm5, (%rax) 14623; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rax) 14624; AVX512-NEXT: addq $3400, %rsp # imm = 0xD48 14625; AVX512-NEXT: vzeroupper 14626; AVX512-NEXT: retq 14627; 14628; AVX512-FCP-LABEL: load_i32_stride7_vf64: 14629; AVX512-FCP: # %bb.0: 14630; AVX512-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 14631; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 14632; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 14633; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm11 14634; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7 14635; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 14636; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 14637; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 14638; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 14639; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 14640; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 14641; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 14642; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 14643; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 14644; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 14645; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 14646; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 14647; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 14648; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 14649; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 14650; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14651; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 14652; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 14653; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 14654; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14655; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 14656; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 14657; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 14658; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14659; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 14660; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 14661; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14662; 
AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 14663; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 14664; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 14665; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 14666; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 14667; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 14668; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 14669; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14670; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 14671; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 14672; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 14673; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14674; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 14675; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 14676; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 14677; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14678; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 14679; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 14680; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14681; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 14682; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 14683; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 14684; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 14685; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 14686; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 14687; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 14688; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14689; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 14690; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 14691; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 14692; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14693; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 14694; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 14695; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 14696; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14697; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 14698; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 14699; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14700; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 14701; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 14702; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 14703; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 14704; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 14705; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 14706; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 14707; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14708; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 14709; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 14710; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 14711; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14712; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 14713; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 14714; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 14715; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14716; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 14717; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 14718; AVX512-FCP-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14719; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 14720; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 14721; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 14722; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 14723; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 14724; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 14725; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 14726; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14727; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 14728; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 14729; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 14730; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14731; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 14732; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 14733; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 14734; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14735; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 14736; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 14737; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14738; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 14739; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 14740; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 14741; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 14742; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 14743; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 14744; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14745; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 14746; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 14747; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 14748; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 14749; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14750; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 14751; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 14752; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 14753; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 14754; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14755; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 14756; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm19 14757; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 14758; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 14759; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14760; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 14761; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 14762; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 14763; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 14764; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 14765; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14766; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 14767; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 14768; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 14769; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14770; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 14771; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 14772; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 14773; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14774; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 14775; 
AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 14776; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 14777; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 14778; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 14779; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 14780; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 14781; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 14782; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14783; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 14784; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 14785; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14786; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 14787; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 14788; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14789; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 14790; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 14791; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14792; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 14793; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 14794; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14795; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 14796; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 14797; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14798; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 14799; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 14800; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14801; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 14802; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 14803; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14804; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 14805; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 14806; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 14807; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 14808; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14809; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 14810; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 14811; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 14812; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 14813; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14814; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 14815; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 14816; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 14817; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 14818; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14819; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 14820; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 14821; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 14822; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 14823; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14824; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 14825; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 14826; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 14827; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14828; 
AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 14829; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 14830; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14831; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 14832; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 14833; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14834; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 14835; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 14836; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14837; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 14838; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 14839; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14840; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 14841; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14842; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 14843; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 14844; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 14845; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 14846; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 14847; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 14848; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14849; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 14850; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 14851; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 14852; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14853; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 14854; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 14855; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 14856; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14857; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 14858; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 14859; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14860; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 14861; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14862; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 14863; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 14864; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] 14865; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 14866; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 14867; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] 14868; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 14869; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 14870; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] 14871; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 14872; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 14873; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] 14874; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 14875; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 14876; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] 14877; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 14878; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 14879; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14880; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] 14881; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 14882; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 14883; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14884; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] 14885; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 14886; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 14887; 
AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 14888; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 14889; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 14890; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 14891; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 14892; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 14893; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 14894; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 14895; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 14896; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 14897; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 14898; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14899; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 14900; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 14901; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 14902; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 14903; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm9 14904; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 14905; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 14906; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 14907; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 14908; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 14909; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 14910; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 14911; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 14912; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 14913; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 14914; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 14915; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 14916; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 14917; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 14918; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 14919; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 14920; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 14921; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 14922; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 14923; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 14924; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 14925; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 14926; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 14927; AVX512-FCP-NEXT: movw $992, %ax # imm = 0x3E0 14928; AVX512-FCP-NEXT: kmovw %eax, %k1 14929; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14930; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} 14931; AVX512-FCP-NEXT: movb $-32, %al 14932; AVX512-FCP-NEXT: kmovw %eax, %k2 14933; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14934; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} 14935; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14936; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} 14937; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14938; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} 14939; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload 14940; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} 14941; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14942; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} 14943; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} 14944; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14945; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} 14946; AVX512-FCP-NEXT: movw $480, %ax # imm = 0x1E0 14947; AVX512-FCP-NEXT: kmovw %eax, %k2 14948; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14949; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} 14950; AVX512-FCP-NEXT: movw $-512, %ax # 
imm = 0xFE00 14951; AVX512-FCP-NEXT: kmovw %eax, %k1 14952; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14953; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} 14954; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14955; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} 14956; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14957; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} 14958; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14959; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} 14960; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14961; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} 14962; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14963; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} 14964; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14965; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 14966; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14967; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} 14968; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14969; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} 14970; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14971; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} 14972; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14973; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} 14974; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14975; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} 14976; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14977; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} 14978; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14979; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} 14980; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14981; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} 14982; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14983; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} 14984; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14985; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} 14986; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14987; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} 14988; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14989; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} 14990; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14991; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} 14992; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14993; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} 14994; AVX512-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} 14995; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14996; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 14997; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 14998; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 14999; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 15000; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} 15001; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 15002; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, 
%zmm9 # 16-byte Folded Reload 15003; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 15004; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} 15005; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 15006; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload 15007; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 15008; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} 15009; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 15010; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 15011; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} 15012; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 15013; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 15014; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 15015; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 15016; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 15017; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload 15018; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 15019; AVX512-FCP-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} 15020; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 15021; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload 15022; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15023; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} 15024; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 15025; AVX512-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} 15026; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15027; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 15028; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15029; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} 15030; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15031; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 15032; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15033; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} 15034; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15035; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 15036; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15037; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} 15038; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15039; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 15040; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15041; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} 15042; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rsi) 15043; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) 15044; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rsi) 15045; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%rsi) 15046; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) 15047; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) 15048; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) 15049; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 128(%rdx) 15050; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%rcx) 15051; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) 15052; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 64(%rcx) 15053; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) 15054; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 
192(%r8) 15055; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r8) 15056; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) 15057; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) 15058; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) 15059; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r9) 15060; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) 15061; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9) 15062; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 15063; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 192(%rax) 15064; AVX512-FCP-NEXT: vmovdqa64 %zmm27, (%rax) 15065; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%rax) 15066; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 128(%rax) 15067; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 15068; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) 15069; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) 15070; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) 15071; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) 15072; AVX512-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 15073; AVX512-FCP-NEXT: vzeroupper 15074; AVX512-FCP-NEXT: retq 15075; 15076; AVX512DQ-LABEL: load_i32_stride7_vf64: 15077; AVX512DQ: # %bb.0: 15078; AVX512DQ-NEXT: subq $3400, %rsp # imm = 0xD48 15079; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm2 15080; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm17 15081; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm11 15082; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm7 15083; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm5 15084; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm12 15085; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm6 15086; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm8 15087; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm13 15088; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm20 15089; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm4 15090; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm14 15091; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 15092; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 15093; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 15094; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 15095; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 15096; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 15097; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 15098; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15099; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 15100; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 15101; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 15102; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15103; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 15104; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 15105; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 15106; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15107; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 15108; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 15109; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15110; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 15111; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 15112; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 15113; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 15114; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 15115; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 15116; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 15117; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15118; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 15119; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 15120; AVX512DQ-NEXT: 
vpermt2d %zmm6, %zmm0, %zmm3 15121; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15122; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 15123; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 15124; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 15125; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15126; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 15127; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 15128; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15129; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 15130; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 15131; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 15132; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 15133; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 15134; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 15135; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 15136; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15137; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 15138; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 15139; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 15140; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15141; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 15142; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 15143; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 15144; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15145; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 15146; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 15147; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15148; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 15149; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 15150; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 15151; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 15152; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 15153; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 15154; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 15155; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15156; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 15157; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 15158; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 15159; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15160; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 15161; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 15162; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 15163; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15164; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 15165; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 15166; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15167; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 15168; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 15169; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 15170; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 15171; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 15172; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 15173; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 15174; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15175; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 15176; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 15177; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 15178; AVX512DQ-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15179; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 15180; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 15181; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 15182; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15183; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 15184; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 15185; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15186; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm3 15187; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm15 15188; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 15189; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 15190; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 15191; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 15192; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15193; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm9 15194; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm16 15195; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 15196; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 15197; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15198; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm0 15199; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm18 15200; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 15201; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 15202; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15203; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm1 15204; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm19 15205; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 15206; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 15207; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15208; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm21 15209; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 15210; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 15211; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 15212; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 15213; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15214; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm21 15215; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 15216; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 15217; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15218; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm21 15219; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 15220; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 15221; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15222; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 15223; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 15224; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 15225; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 15226; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 15227; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 15228; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 15229; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 15230; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15231; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 15232; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 15233; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15234; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 15235; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 15236; AVX512DQ-NEXT: vmovdqu64 
%zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15237; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 15238; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 15239; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15240; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 15241; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 15242; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15243; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 15244; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 15245; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15246; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 15247; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 15248; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15249; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 15250; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 15251; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15252; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 15253; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 15254; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm2 15255; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 15256; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15257; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 15258; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 15259; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 15260; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 15261; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15262; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 15263; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 15264; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 15265; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 15266; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15267; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 15268; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 15269; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 15270; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 15271; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15272; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 15273; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 15274; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 15275; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15276; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm4 15277; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 15278; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15279; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 15280; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 15281; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15282; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 15283; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 15284; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15285; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 15286; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 15287; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15288; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 15289; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15290; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 15291; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 15292; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte 
Spill 15293; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 15294; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 15295; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 15296; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15297; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 15298; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 15299; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 15300; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15301; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 15302; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 15303; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 15304; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15305; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 15306; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 15307; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15308; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 15309; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15310; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm0 15311; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm17 15312; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] 15313; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm22 15314; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 15315; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] 15316; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm23 15317; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 15318; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] 15319; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 15320; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 15321; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] 15322; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 15323; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 15324; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] 15325; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 15326; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 15327; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15328; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] 15329; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 15330; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 15331; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15332; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] 15333; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 15334; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 15335; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm0 15336; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 15337; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 15338; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 15339; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 15340; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 15341; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 15342; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 15343; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 15344; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 15345; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 15346; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15347; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 15348; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 15349; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15350; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 15351; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm9 15352; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm6 15353; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 15354; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 15355; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm0 15356; AVX512DQ-NEXT: vmovdqa64 
1344(%rdi), %zmm1 15357; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 15358; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 15359; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 15360; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 15361; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 15362; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 15363; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 15364; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 15365; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 15366; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 15367; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm21 15368; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 15369; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 15370; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm26 15371; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 15372; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 15373; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 15374; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 15375; AVX512DQ-NEXT: movw $992, %ax # imm = 0x3E0 15376; AVX512DQ-NEXT: kmovw %eax, %k1 15377; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15378; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} 15379; AVX512DQ-NEXT: movb $-32, %al 15380; AVX512DQ-NEXT: kmovw %eax, %k2 15381; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15382; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} 15383; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15384; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} 15385; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15386; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} 15387; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload 15388; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} 15389; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15390; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} 15391; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} 15392; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15393; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} 15394; AVX512DQ-NEXT: movw $480, %ax # imm = 0x1E0 15395; AVX512DQ-NEXT: kmovw %eax, %k2 15396; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15397; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} 15398; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00 15399; AVX512DQ-NEXT: kmovw %eax, %k1 15400; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15401; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} 15402; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15403; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} 15404; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15405; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} 15406; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15407; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} 15408; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15409; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} 15410; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15411; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} 15412; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15413; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 15414; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15415; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} 15416; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15417; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm11 
{%k1} 15418; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15419; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} 15420; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15421; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} 15422; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15423; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} 15424; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15425; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} 15426; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15427; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} 15428; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15429; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} 15430; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15431; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} 15432; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15433; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} 15434; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15435; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} 15436; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15437; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} 15438; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15439; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} 15440; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15441; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} 15442; AVX512DQ-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} 15443; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15444; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 15445; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15446; AVX512DQ-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 15447; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 15448; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} 15449; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 15450; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload 15451; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 15452; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} 15453; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 15454; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload 15455; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 15456; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} 15457; AVX512DQ-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 15458; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 15459; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} 15460; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 15461; AVX512DQ-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 15462; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 15463; AVX512DQ-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 15464; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 15465; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload 15466; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 15467; AVX512DQ-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} 15468; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm27 # 64-byte Reload 15469; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload 15470; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15471; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} 15472; AVX512DQ-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 15473; AVX512DQ-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} 15474; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15475; AVX512DQ-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 15476; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15477; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} 15478; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15479; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 15480; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15481; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} 15482; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15483; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 15484; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15485; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} 15486; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15487; AVX512DQ-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 15488; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15489; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} 15490; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rsi) 15491; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rsi) 15492; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rsi) 15493; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rsi) 15494; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) 15495; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%rdx) 15496; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%rdx) 15497; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rdx) 15498; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rcx) 15499; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rcx) 15500; AVX512DQ-NEXT: vmovdqa64 %zmm24, 64(%rcx) 15501; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rcx) 15502; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%r8) 15503; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%r8) 15504; AVX512DQ-NEXT: vmovdqa64 %zmm29, 64(%r8) 15505; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%r8) 15506; AVX512DQ-NEXT: vmovdqa64 %zmm18, 192(%r9) 15507; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%r9) 15508; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%r9) 15509; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%r9) 15510; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 15511; AVX512DQ-NEXT: vmovdqa64 %zmm19, 192(%rax) 15512; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%rax) 15513; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%rax) 15514; AVX512DQ-NEXT: vmovdqa64 %zmm25, 128(%rax) 15515; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 15516; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax) 15517; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) 15518; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax) 15519; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%rax) 15520; AVX512DQ-NEXT: addq $3400, %rsp # imm = 0xD48 15521; AVX512DQ-NEXT: vzeroupper 15522; AVX512DQ-NEXT: retq 15523; 15524; AVX512DQ-FCP-LABEL: load_i32_stride7_vf64: 15525; AVX512DQ-FCP: # %bb.0: 15526; AVX512DQ-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 15527; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 15528; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 15529; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm11 15530; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7 15531; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 15532; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 15533; AVX512DQ-FCP-NEXT: 
vmovdqa64 832(%rdi), %zmm6 15534; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 15535; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 15536; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 15537; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 15538; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 15539; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 15540; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 15541; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 15542; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 15543; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 15544; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 15545; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 15546; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15547; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 15548; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 15549; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 15550; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15551; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 15552; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 15553; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 15554; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15555; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 15556; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 15557; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15558; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 15559; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 15560; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 15561; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 15562; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 15563; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 15564; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 15565; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15566; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 15567; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 15568; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 15569; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15570; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 15571; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 15572; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 15573; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15574; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 15575; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 15576; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15577; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 15578; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 15579; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 15580; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 15581; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 15582; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 15583; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 15584; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15585; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 15586; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 15587; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 15588; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15589; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 15590; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 15591; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 15592; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15593; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 15594; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 15595; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15596; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 15597; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 15598; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 15599; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 15600; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 15601; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 15602; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 15603; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15604; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 15605; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 15606; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 15607; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15608; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 15609; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 15610; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 15611; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15612; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 15613; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 15614; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15615; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 15616; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 15617; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 15618; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 15619; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 15620; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 15621; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 15622; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15623; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 15624; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 15625; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 15626; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15627; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 15628; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 15629; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 15630; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15631; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 15632; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 15633; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15634; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 15635; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 15636; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 15637; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 15638; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 15639; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 15640; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15641; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 15642; AVX512DQ-FCP-NEXT: vmovdqa64 
640(%rdi), %zmm16 15643; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 15644; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 15645; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15646; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 15647; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 15648; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 15649; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 15650; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15651; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 15652; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm19 15653; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 15654; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 15655; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15656; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 15657; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 15658; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 15659; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 15660; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 15661; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15662; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 15663; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 15664; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 15665; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15666; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 15667; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 15668; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 15669; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15670; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 15671; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 15672; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 15673; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 15674; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 15675; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 15676; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 15677; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 15678; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15679; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 15680; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 15681; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15682; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 15683; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 15684; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15685; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 15686; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 15687; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15688; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 15689; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 15690; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15691; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 15692; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 15693; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15694; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 15695; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 15696; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15697; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 15698; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 15699; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15700; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 15701; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 15702; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 15703; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 15704; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15705; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 15706; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 15707; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 15708; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 15709; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15710; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 15711; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 15712; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 15713; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 15714; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15715; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 15716; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 15717; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 15718; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 15719; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15720; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 15721; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 15722; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 15723; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15724; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 15725; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 15726; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15727; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 15728; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 15729; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15730; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 15731; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 15732; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15733; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 15734; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 15735; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15736; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 15737; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15738; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 15739; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 15740; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 15741; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 15742; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 15743; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 15744; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15745; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 15746; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 15747; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 15748; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15749; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 15750; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 
%zmm0 15751; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 15752; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15753; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 15754; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 15755; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15756; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 15757; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15758; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 15759; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 15760; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] 15761; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 15762; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 15763; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] 15764; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 15765; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 15766; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] 15767; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 15768; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 15769; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] 15770; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 15771; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 15772; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] 15773; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 15774; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 15775; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15776; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] 15777; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 15778; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 15779; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15780; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] 15781; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 15782; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 15783; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 15784; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 15785; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 15786; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 15787; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 15788; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 15789; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 15790; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 15791; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 15792; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 15793; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 15794; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15795; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 15796; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 15797; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15798; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 15799; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm9 15800; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 15801; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 15802; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 15803; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 15804; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 15805; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 15806; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 15807; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 15808; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 15809; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 15810; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 15811; 
AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 15812; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 15813; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 15814; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 15815; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 15816; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 15817; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 15818; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 15819; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 15820; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 15821; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 15822; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 15823; AVX512DQ-FCP-NEXT: movw $992, %ax # imm = 0x3E0 15824; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 15825; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15826; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} 15827; AVX512DQ-FCP-NEXT: movb $-32, %al 15828; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 15829; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15830; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} 15831; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15832; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} 15833; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15834; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} 15835; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload 15836; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} 15837; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15838; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} 15839; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} 15840; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15841; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} 15842; AVX512DQ-FCP-NEXT: movw $480, %ax # imm = 0x1E0 15843; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 15844; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15845; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} 15846; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 15847; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 15848; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15849; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} 15850; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15851; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} 15852; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15853; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} 15854; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15855; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} 15856; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15857; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} 15858; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15859; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} 15860; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15861; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 15862; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15863; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} 15864; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15865; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} 15866; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15867; 
AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} 15868; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15869; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} 15870; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15871; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} 15872; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15873; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} 15874; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15875; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} 15876; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15877; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} 15878; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15879; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} 15880; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15881; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} 15882; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15883; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} 15884; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15885; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} 15886; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15887; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} 15888; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15889; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} 15890; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} 15891; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15892; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 15893; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 15894; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 15895; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 15896; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} 15897; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 15898; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload 15899; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 15900; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} 15901; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 15902; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload 15903; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 15904; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} 15905; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 15906; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 15907; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} 15908; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 15909; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 15910; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 15911; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 15912; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 15913; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload 15914; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 15915; 
AVX512DQ-FCP-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} 15916; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 15917; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload 15918; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15919; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} 15920; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 15921; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} 15922; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15923; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 15924; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15925; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} 15926; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15927; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 15928; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15929; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} 15930; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15931; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 15932; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15933; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} 15934; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15935; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 15936; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 15937; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} 15938; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rsi) 15939; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) 15940; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rsi) 15941; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%rsi) 15942; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) 15943; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) 15944; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) 15945; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 128(%rdx) 15946; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%rcx) 15947; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) 15948; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 64(%rcx) 15949; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) 15950; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) 15951; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r8) 15952; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) 15953; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) 15954; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) 15955; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%r9) 15956; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) 15957; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9) 15958; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 15959; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 192(%rax) 15960; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, (%rax) 15961; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%rax) 15962; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 128(%rax) 15963; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 15964; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) 15965; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) 15966; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax) 15967; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) 15968; AVX512DQ-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 15969; AVX512DQ-FCP-NEXT: vzeroupper 15970; AVX512DQ-FCP-NEXT: retq 15971; 15972; AVX512BW-LABEL: load_i32_stride7_vf64: 15973; AVX512BW: # %bb.0: 15974; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 15975; AVX512BW-NEXT: vmovdqa64 
1728(%rdi), %zmm2 15976; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm17 15977; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm11 15978; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm7 15979; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 15980; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 15981; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 15982; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 15983; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm13 15984; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 15985; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 15986; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm14 15987; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 15988; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 15989; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 15990; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 15991; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 15992; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 15993; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 15994; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15995; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 15996; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 15997; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 15998; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 15999; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 16000; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 16001; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16002; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16003; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 16004; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16005; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16006; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 16007; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 16008; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 16009; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 16010; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 16011; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16012; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16013; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16014; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 16015; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 16016; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16017; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16018; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 16019; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 16020; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16021; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16022; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 16023; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16024; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16025; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 16026; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 16027; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 16028; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 16029; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 16030; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16031; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16032; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16033; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 16034; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 16035; 
AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16036; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16037; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 16038; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 16039; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16040; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16041; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 16042; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16043; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16044; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 16045; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 16046; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 16047; AVX512BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 16048; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 16049; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16050; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16051; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16052; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 16053; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 16054; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16055; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16056; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 16057; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 16058; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16059; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16060; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 16061; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16062; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16063; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 16064; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 16065; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 16066; AVX512BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 16067; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 16068; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16069; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16070; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16071; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 16072; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 16073; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16074; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16075; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 16076; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 16077; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16078; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16079; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 16080; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16081; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16082; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 16083; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm15 16084; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 16085; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16086; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 16087; AVX512BW-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 16088; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16089; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 16090; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 16091; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 16092; AVX512BW-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 16093; AVX512BW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16094; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 16095; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm18 16096; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 16097; AVX512BW-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 16098; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16099; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 16100; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm19 16101; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 16102; AVX512BW-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 16103; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16104; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 16105; AVX512BW-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 16106; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 16107; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 16108; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 16109; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16110; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 16111; AVX512BW-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 16112; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 16113; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16114; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 16115; AVX512BW-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 16116; AVX512BW-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 16117; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16118; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 16119; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 16120; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 16121; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16122; AVX512BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 16123; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 16124; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 16125; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 16126; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16127; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 16128; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 16129; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16130; AVX512BW-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 16131; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 16132; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16133; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 16134; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 16135; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16136; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 16137; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 16138; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16139; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 16140; AVX512BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 16141; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16142; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 16143; AVX512BW-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 16144; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16145; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 16146; AVX512BW-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 16147; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16148; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 16149; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 16150; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 16151; 
AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 16152; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16153; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 16154; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16155; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 16156; AVX512BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 16157; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16158; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 16159; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 16160; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 16161; AVX512BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 16162; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16163; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 16164; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 16165; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 16166; AVX512BW-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 16167; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16168; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 16169; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 16170; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 16171; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16172; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 16173; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 16174; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16175; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 16176; AVX512BW-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 16177; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16178; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 16179; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 16180; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16181; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 16182; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 16183; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16184; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 16185; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16186; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 16187; AVX512BW-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 16188; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 16189; AVX512BW-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 16190; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 16191; AVX512BW-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 16192; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16193; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 16194; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 16195; AVX512BW-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 16196; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16197; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 16198; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 16199; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 16200; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16201; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 16202; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 16203; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16204; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 16205; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16206; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 16207; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 16208; AVX512BW-NEXT: vpmovsxbd 
{{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] 16209; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 16210; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 16211; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] 16212; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 16213; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 16214; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] 16215; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 16216; AVX512BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 16217; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] 16218; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 16219; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 16220; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] 16221; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 16222; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 16223; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16224; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] 16225; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 16226; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 16227; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16228; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] 16229; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 16230; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 16231; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 16232; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 16233; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 16234; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 16235; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 16236; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 16237; AVX512BW-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 16238; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 16239; AVX512BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 16240; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 16241; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 16242; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16243; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 16244; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 16245; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16246; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 16247; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 16248; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 16249; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 16250; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 16251; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 16252; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 16253; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 16254; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 16255; AVX512BW-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 16256; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 16257; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 16258; AVX512BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 16259; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 16260; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 16261; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 16262; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 16263; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 16264; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 16265; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 16266; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm26 16267; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 16268; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 16269; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 16270; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 16271; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 16272; AVX512BW-NEXT: kmovd %eax, %k1 16273; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16274; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} 16275; 
AVX512BW-NEXT: movb $-32, %al 16276; AVX512BW-NEXT: kmovd %eax, %k2 16277; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16278; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} 16279; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16280; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} 16281; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16282; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} 16283; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload 16284; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} 16285; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16286; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} 16287; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} 16288; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16289; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} 16290; AVX512BW-NEXT: movw $480, %ax # imm = 0x1E0 16291; AVX512BW-NEXT: kmovd %eax, %k2 16292; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16293; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} 16294; AVX512BW-NEXT: movw $-512, %ax # imm = 0xFE00 16295; AVX512BW-NEXT: kmovd %eax, %k1 16296; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16297; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} 16298; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16299; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} 16300; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16301; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} 16302; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16303; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} 16304; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16305; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} 16306; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16307; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} 16308; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16309; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 16310; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16311; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} 16312; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16313; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} 16314; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16315; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} 16316; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16317; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} 16318; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16319; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} 16320; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16321; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} 16322; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16323; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} 16324; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16325; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} 16326; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16327; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} 16328; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16329; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} 16330; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16331; 
AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} 16332; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16333; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} 16334; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16335; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} 16336; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16337; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} 16338; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} 16339; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16340; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 16341; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16342; AVX512BW-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 16343; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 16344; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} 16345; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 16346; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload 16347; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 16348; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} 16349; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 16350; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload 16351; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 16352; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} 16353; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 16354; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 16355; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} 16356; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 16357; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 16358; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 16359; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 16360; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 16361; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload 16362; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 16363; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} 16364; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 16365; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload 16366; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16367; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} 16368; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 16369; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} 16370; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16371; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 16372; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16373; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} 16374; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16375; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 16376; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16377; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} 16378; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16379; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 16380; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 
16381; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} 16382; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16383; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 16384; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16385; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} 16386; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rsi) 16387; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) 16388; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rsi) 16389; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rsi) 16390; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) 16391; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) 16392; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rdx) 16393; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) 16394; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rcx) 16395; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) 16396; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%rcx) 16397; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rcx) 16398; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%r8) 16399; AVX512BW-NEXT: vmovdqa64 %zmm16, (%r8) 16400; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r8) 16401; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%r8) 16402; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%r9) 16403; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r9) 16404; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%r9) 16405; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r9) 16406; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 16407; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rax) 16408; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rax) 16409; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rax) 16410; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rax) 16411; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 16412; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) 16413; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) 16414; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) 16415; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) 16416; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 16417; AVX512BW-NEXT: vzeroupper 16418; AVX512BW-NEXT: retq 16419; 16420; AVX512BW-FCP-LABEL: load_i32_stride7_vf64: 16421; AVX512BW-FCP: # %bb.0: 16422; AVX512BW-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 16423; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 16424; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 16425; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm11 16426; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7 16427; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 16428; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 16429; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 16430; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 16431; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 16432; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 16433; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 16434; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 16435; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 16436; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16437; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 16438; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 16439; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 16440; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16441; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16442; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16443; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 16444; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 16445; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16446; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16447; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm12, %zmm3 16448; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 16449; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16450; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16451; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 16452; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16453; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16454; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 16455; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 16456; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 16457; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 16458; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 16459; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16460; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16461; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16462; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 16463; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 16464; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16465; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16466; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 16467; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 16468; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16469; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16470; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 16471; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16472; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16473; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 16474; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 16475; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 16476; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 16477; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 16478; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16479; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16480; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16481; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 16482; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 16483; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16484; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16485; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 16486; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 16487; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16488; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16489; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 16490; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16491; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16492; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 16493; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 16494; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 16495; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 16496; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 16497; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16498; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16499; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16500; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 16501; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 
16502; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16503; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16504; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 16505; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 16506; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16507; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16508; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 16509; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16510; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16511; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 16512; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 16513; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 16514; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 16515; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 16516; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16517; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16518; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16519; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 16520; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 16521; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16522; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16523; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 16524; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 16525; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16526; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16527; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 16528; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16529; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16530; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 16531; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 16532; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 16533; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16534; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 16535; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 16536; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16537; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 16538; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 16539; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 16540; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 16541; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16542; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 16543; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 16544; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 16545; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 16546; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16547; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 16548; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm19 16549; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 16550; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 16551; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16552; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 16553; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 16554; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 16555; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 16556; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 16557; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16558; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 16559; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 16560; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 16561; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16562; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 16563; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 16564; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 16565; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16566; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 16567; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 16568; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 16569; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16570; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 16571; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 16572; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 16573; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 16574; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16575; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 16576; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 16577; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16578; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 16579; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 16580; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16581; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 16582; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 16583; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16584; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 16585; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 16586; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16587; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 16588; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 16589; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16590; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 16591; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 16592; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16593; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 16594; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 16595; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16596; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 16597; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 16598; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 16599; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 16600; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16601; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 16602; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16603; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 16604; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 16605; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16606; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 16607; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 16608; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 16609; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 16610; 
AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16611; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 16612; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 16613; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 16614; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 16615; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16616; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 16617; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 16618; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 16619; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16620; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 16621; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 16622; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16623; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 16624; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 16625; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16626; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 16627; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 16628; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16629; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 16630; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 16631; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16632; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 16633; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16634; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 16635; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 16636; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 16637; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 16638; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 16639; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 16640; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16641; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 16642; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 16643; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 16644; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16645; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 16646; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 16647; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 16648; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16649; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 16650; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 16651; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16652; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 16653; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16654; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 16655; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 16656; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] 16657; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 16658; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 16659; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] 16660; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 16661; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 16662; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] 16663; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 16664; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 16665; AVX512BW-FCP-NEXT: vpmovsxbd 
{{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] 16666; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 16667; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 16668; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] 16669; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 16670; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 16671; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16672; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] 16673; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 16674; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 16675; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16676; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] 16677; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 16678; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 16679; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 16680; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 16681; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 16682; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 16683; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 16684; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 16685; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 16686; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 16687; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 16688; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 16689; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 16690; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16691; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 16692; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 16693; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16694; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 16695; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm9 16696; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 16697; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 16698; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 16699; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 16700; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 16701; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 16702; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 16703; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 16704; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 16705; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 16706; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 16707; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 16708; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 16709; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 16710; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 16711; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 16712; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 16713; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 16714; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 16715; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 16716; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 16717; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 16718; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 16719; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 16720; AVX512BW-FCP-NEXT: kmovd %eax, %k1 16721; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16722; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} 16723; AVX512BW-FCP-NEXT: movb $-32, %al 16724; AVX512BW-FCP-NEXT: kmovd %eax, %k2 16725; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16726; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} 16727; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16728; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} 16729; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16730; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} 16731; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload 16732; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} 16733; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16734; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} 16735; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} 16736; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16737; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} 16738; AVX512BW-FCP-NEXT: movw $480, %ax # imm = 0x1E0 16739; AVX512BW-FCP-NEXT: kmovd %eax, %k2 16740; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16741; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} 16742; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 16743; AVX512BW-FCP-NEXT: kmovd %eax, %k1 16744; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16745; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} 16746; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16747; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} 16748; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16749; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} 16750; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16751; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} 16752; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16753; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} 16754; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16755; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} 16756; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16757; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 16758; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16759; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} 16760; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16761; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} 16762; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16763; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} 16764; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16765; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} 16766; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16767; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} 16768; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16769; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} 16770; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16771; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} 16772; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16773; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} 16774; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16775; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} 16776; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16777; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} 16778; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16779; AVX512BW-FCP-NEXT: vmovdqa32 
%zmm0, %zmm29 {%k2} 16780; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16781; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} 16782; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16783; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} 16784; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16785; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} 16786; AVX512BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} 16787; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16788; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 16789; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 16790; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 16791; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 16792; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} 16793; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 16794; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload 16795; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 16796; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} 16797; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 16798; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload 16799; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 16800; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} 16801; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 16802; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 16803; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} 16804; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 16805; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 16806; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 16807; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 16808; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 16809; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload 16810; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 16811; AVX512BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} 16812; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 16813; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload 16814; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16815; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} 16816; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 16817; AVX512BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} 16818; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16819; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 16820; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16821; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} 16822; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16823; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 16824; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16825; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} 16826; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 
# 64-byte Reload 16827; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 16828; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16829; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} 16830; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16831; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 16832; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 16833; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} 16834; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rsi) 16835; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) 16836; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rsi) 16837; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%rsi) 16838; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) 16839; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) 16840; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) 16841; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rdx) 16842; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rcx) 16843; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) 16844; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%rcx) 16845; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) 16846; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) 16847; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%r8) 16848; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) 16849; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) 16850; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) 16851; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%r9) 16852; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) 16853; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9) 16854; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 16855; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rax) 16856; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, (%rax) 16857; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rax) 16858; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rax) 16859; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 16860; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) 16861; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) 16862; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) 16863; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) 16864; AVX512BW-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 16865; AVX512BW-FCP-NEXT: vzeroupper 16866; AVX512BW-FCP-NEXT: retq 16867; 16868; AVX512DQ-BW-LABEL: load_i32_stride7_vf64: 16869; AVX512DQ-BW: # %bb.0: 16870; AVX512DQ-BW-NEXT: subq $3400, %rsp # imm = 0xD48 16871; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 16872; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm17 16873; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm11 16874; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm7 16875; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 16876; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 16877; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm6 16878; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm8 16879; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm13 16880; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm20 16881; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 16882; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm14 16883; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 16884; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16885; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 16886; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 16887; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 16888; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16889; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16890; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill 16891; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 16892; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 16893; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16894; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16895; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 16896; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 16897; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16898; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16899; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 16900; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16901; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16902; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 16903; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 16904; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 16905; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 16906; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 16907; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16908; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16909; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16910; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 16911; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 16912; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16913; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16914; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 16915; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 16916; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16917; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16918; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 16919; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16920; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16921; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 16922; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 16923; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 16924; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 16925; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 16926; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16927; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16928; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16929; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 16930; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 16931; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16932; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16933; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 16934; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 16935; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16936; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16937; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 16938; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16939; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16940; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 16941; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 16942; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 16943; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 16944; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 16945; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16946; 
AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16947; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16948; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 16949; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 16950; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16951; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16952; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 16953; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 16954; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16955; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16956; AVX512DQ-BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 16957; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16958; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16959; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 16960; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 16961; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 16962; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 16963; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 16964; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 16965; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 16966; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16967; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 16968; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 16969; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 16970; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16971; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 16972; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 16973; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 16974; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16975; AVX512DQ-BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 16976; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 16977; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16978; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 16979; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm15 16980; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 16981; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 16982; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 16983; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 16984; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16985; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm9 16986; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm16 16987; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 16988; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 16989; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16990; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm0 16991; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm18 16992; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 16993; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 16994; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 16995; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 16996; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm19 16997; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 16998; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 16999; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17000; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm21 17001; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 17002; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 17003; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 17004; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 17005; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17006; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm21 17007; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 17008; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 17009; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17010; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm21 17011; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 17012; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 17013; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17014; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 17015; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 17016; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 17017; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17018; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 17019; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 17020; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 17021; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 17022; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17023; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 17024; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 17025; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17026; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 17027; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 17028; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17029; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 17030; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 17031; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17032; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 17033; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 17034; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17035; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 17036; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 17037; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17038; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 17039; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 17040; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17041; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 17042; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 17043; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17044; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 17045; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 17046; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm2 17047; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 17048; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17049; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 17050; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17051; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 17052; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 17053; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17054; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 17055; AVX512DQ-BW-NEXT: # zmm28 = 
mem[0,1,2,3,0,1,2,3] 17056; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 17057; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 17058; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17059; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 17060; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 17061; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 17062; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 17063; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17064; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 17065; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 17066; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 17067; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17068; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm4 17069; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 17070; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17071; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 17072; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 17073; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17074; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 17075; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 17076; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17077; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 17078; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 17079; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17080; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 17081; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17082; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 17083; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 17084; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 17085; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 17086; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 17087; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 17088; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17089; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 17090; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 17091; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 17092; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17093; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 17094; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 17095; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 17096; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17097; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 17098; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 17099; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17100; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 17101; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17102; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm0 17103; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm17 17104; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] 17105; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm22 17106; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 17107; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] 17108; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm23 17109; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 17110; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] 17111; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 17112; 
AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 17113; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] 17114; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 17115; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 17116; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] 17117; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 17118; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 17119; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17120; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] 17121; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 17122; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 17123; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17124; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] 17125; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 17126; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 17127; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0 17128; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 17129; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 17130; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 17131; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 17132; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 17133; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 17134; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 17135; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 17136; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 17137; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 17138; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17139; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 17140; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 17141; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17142; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 17143; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm9 17144; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm6 17145; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 17146; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 17147; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 17148; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 17149; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 17150; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 17151; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 17152; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 17153; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 17154; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 17155; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 17156; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12 17157; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 17158; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 17159; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm21 17160; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 17161; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 17162; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm26 17163; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 17164; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 17165; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 17166; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 17167; AVX512DQ-BW-NEXT: movw $992, %ax # imm = 0x3E0 17168; AVX512DQ-BW-NEXT: kmovd %eax, %k1 17169; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17170; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} 17171; AVX512DQ-BW-NEXT: movb $-32, %al 17172; AVX512DQ-BW-NEXT: kmovd %eax, %k2 17173; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17174; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} 17175; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17176; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} 17177; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17178; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} 17179; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload 17180; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} 17181; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17182; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} 17183; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} 17184; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17185; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} 17186; AVX512DQ-BW-NEXT: movw $480, %ax # imm = 0x1E0 17187; AVX512DQ-BW-NEXT: kmovd %eax, %k2 17188; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17189; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} 17190; AVX512DQ-BW-NEXT: movw $-512, %ax # imm = 0xFE00 17191; AVX512DQ-BW-NEXT: kmovd %eax, %k1 17192; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17193; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} 17194; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17195; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} 17196; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17197; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} 17198; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17199; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} 17200; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17201; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} 17202; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17203; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} 17204; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17205; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 17206; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17207; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} 17208; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17209; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} 17210; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17211; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} 17212; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17213; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} 17214; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17215; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} 17216; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17217; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} 17218; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17219; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} 17220; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17221; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} 17222; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17223; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} 17224; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17225; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} 17226; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17227; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} 17228; AVX512DQ-BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17229; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} 17230; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17231; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} 17232; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17233; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} 17234; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} 17235; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17236; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 17237; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17238; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 17239; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 17240; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} 17241; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 17242; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload 17243; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 17244; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} 17245; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 17246; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload 17247; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 17248; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} 17249; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 17250; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 17251; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} 17252; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 17253; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 17254; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 17255; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 17256; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 17257; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload 17258; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 17259; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} 17260; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 17261; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload 17262; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17263; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} 17264; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 17265; AVX512DQ-BW-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} 17266; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17267; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 17268; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17269; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} 17270; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17271; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 17272; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17273; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} 17274; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17275; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 17276; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17277; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} 17278; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17279; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 17280; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17281; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} 17282; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rsi) 17283; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) 17284; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%rsi) 17285; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%rsi) 17286; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) 17287; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rdx) 17288; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 64(%rdx) 17289; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) 17290; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%rcx) 17291; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rcx) 17292; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 64(%rcx) 17293; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rcx) 17294; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 192(%r8) 17295; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%r8) 17296; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 64(%r8) 17297; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 128(%r8) 17298; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%r9) 17299; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%r9) 17300; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%r9) 17301; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%r9) 17302; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 17303; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 192(%rax) 17304; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, (%rax) 17305; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%rax) 17306; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 128(%rax) 17307; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 17308; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax) 17309; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rax) 17310; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) 17311; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 64(%rax) 17312; AVX512DQ-BW-NEXT: addq $3400, %rsp # imm = 0xD48 17313; AVX512DQ-BW-NEXT: vzeroupper 17314; AVX512DQ-BW-NEXT: retq 17315; 17316; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf64: 17317; AVX512DQ-BW-FCP: # %bb.0: 17318; AVX512DQ-BW-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 17319; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 17320; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 17321; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm11 17322; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7 17323; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 17324; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 17325; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 17326; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 17327; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 17328; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 17329; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 17330; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 17331; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] 17332; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17333; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 17334; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 17335; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] 17336; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 17337; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 17338; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17339; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 17340; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 17341; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 17342; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17343; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 17344; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 17345; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 17346; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17347; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 17348; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 17349; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17350; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] 17351; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 17352; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 17353; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 17354; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] 17355; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 17356; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 17357; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17358; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 17359; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 17360; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 17361; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17362; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 17363; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 17364; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 17365; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17366; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 17367; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 17368; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17369; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] 17370; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 17371; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 17372; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 17373; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] 17374; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 17375; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 17376; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17377; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 17378; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 17379; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 17380; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17381; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 17382; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 17383; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 17384; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17385; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 17386; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 17387; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17388; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] 17389; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 17390; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 17391; AVX512DQ-BW-FCP-NEXT: 
vpermt2d %zmm12, %zmm1, %zmm3 17392; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] 17393; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 17394; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 17395; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17396; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 17397; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 17398; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 17399; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17400; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 17401; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 17402; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 17403; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17404; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 17405; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 17406; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17407; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] 17408; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 17409; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 17410; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 17411; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] 17412; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 17413; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 17414; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17415; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 17416; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 17417; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 17418; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17419; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 17420; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 17421; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 17422; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17423; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 17424; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 17425; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17426; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 17427; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 17428; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] 17429; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17430; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 17431; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 17432; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17433; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 17434; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 17435; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 17436; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 17437; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17438; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 17439; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 17440; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 17441; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 17442; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17443; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 17444; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm19 17445; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 17446; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 17447; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17448; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 17449; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 17450; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] 17451; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 17452; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 17453; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17454; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 17455; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 17456; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 17457; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17458; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 17459; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 17460; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 17461; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17462; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 17463; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 17464; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] 17465; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17466; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 17467; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] 17468; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 17469; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 17470; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17471; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 17472; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 17473; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17474; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 17475; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 17476; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17477; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 17478; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 17479; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17480; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 17481; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 17482; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17483; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 17484; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 17485; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17486; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 17487; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 17488; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17489; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 17490; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 17491; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17492; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] 17493; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 17494; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 17495; AVX512DQ-BW-FCP-NEXT: 
vpermt2d %zmm9, %zmm25, %zmm2 17496; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17497; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] 17498; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 17499; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 17500; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 17501; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17502; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] 17503; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] 17504; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 17505; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 17506; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17507; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] 17508; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 17509; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 17510; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 17511; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17512; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] 17513; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 17514; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 17515; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17516; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 17517; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 17518; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17519; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 17520; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 17521; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17522; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 17523; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 17524; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17525; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 17526; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 17527; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17528; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 17529; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17530; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 17531; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 17532; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 17533; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 17534; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 17535; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 17536; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17537; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 17538; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 17539; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 17540; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17541; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 17542; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 17543; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 17544; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17545; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 17546; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 
17547; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17548; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 17549; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17550; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 17551; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 17552; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] 17553; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 17554; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 17555; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] 17556; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 17557; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 17558; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] 17559; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 17560; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 17561; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] 17562; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 17563; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 17564; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] 17565; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 17566; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 17567; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17568; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] 17569; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 17570; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 17571; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17572; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] 17573; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 17574; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 17575; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 17576; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 17577; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 17578; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 17579; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 17580; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 17581; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 17582; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 17583; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 17584; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 17585; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 17586; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17587; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 17588; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 17589; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 17590; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 17591; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm9 17592; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 17593; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 17594; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 17595; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 17596; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 17597; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 17598; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 17599; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 17600; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 17601; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 17602; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 17603; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 17604; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm9, %zmm12 17605; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 17606; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 17607; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 17608; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 17609; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 17610; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 17611; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 17612; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 17613; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 17614; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 17615; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 17616; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 17617; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17618; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} 17619; AVX512DQ-BW-FCP-NEXT: movb $-32, %al 17620; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 17621; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17622; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} 17623; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17624; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} 17625; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17626; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} 17627; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload 17628; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} 17629; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17630; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} 17631; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} 17632; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17633; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} 17634; AVX512DQ-BW-FCP-NEXT: movw $480, %ax # imm = 0x1E0 17635; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 17636; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17637; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} 17638; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 17639; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 17640; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17641; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} 17642; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17643; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} 17644; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17645; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} 17646; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17647; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} 17648; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17649; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} 17650; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17651; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} 17652; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17653; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 17654; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17655; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} 17656; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17657; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} 17658; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17659; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} 17660; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17661; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} 17662; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17663; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} 17664; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17665; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} 17666; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17667; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} 17668; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17669; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} 17670; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17671; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} 17672; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17673; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} 17674; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17675; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} 17676; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17677; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} 17678; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17679; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} 17680; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17681; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} 17682; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} 17683; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17684; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 17685; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 17686; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 17687; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 17688; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} 17689; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 17690; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload 17691; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 17692; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} 17693; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 17694; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload 17695; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 17696; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} 17697; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 17698; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 17699; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} 17700; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 17701; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 17702; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload 17703; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} 17704; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 
64-byte Reload 17705; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload 17706; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 17707; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} 17708; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 17709; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload 17710; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17711; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} 17712; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 17713; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} 17714; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17715; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 17716; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17717; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} 17718; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17719; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 17720; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17721; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} 17722; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17723; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 17724; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17725; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} 17726; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17727; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 17728; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 17729; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} 17730; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rsi) 17731; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) 17732; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rsi) 17733; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%rsi) 17734; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) 17735; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) 17736; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) 17737; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rdx) 17738; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rcx) 17739; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) 17740; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%rcx) 17741; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) 17742; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) 17743; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%r8) 17744; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) 17745; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) 17746; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) 17747; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%r9) 17748; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) 17749; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9) 17750; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 17751; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rax) 17752; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, (%rax) 17753; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rax) 17754; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rax) 17755; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 17756; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) 17757; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) 17758; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: addq $3400, %rsp # imm = 0xD48
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <448 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217, i32 224, i32 231, i32 238, i32 245, i32 252, i32 259, i32 266, i32 273, i32 280, i32 287, i32 294, i32 301, i32 308, i32 315, i32 322, i32 329, i32 336, i32 343, i32 350, i32 357, i32 364, i32 371, i32 378, i32 385, i32 392, i32 399, i32 406, i32 413, i32 420, i32 427, i32 434, i32 441>
  %strided.vec1 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218, i32 225, i32 232, i32 239, i32 246, i32 253, i32 260, i32 267, i32 274, i32 281, i32 288, i32 295, i32 302, i32 309, i32 316, i32 323, i32 330, i32 337, i32 344, i32 351, i32 358, i32 365, i32 372, i32 379, i32 386, i32 393, i32 400, i32 407, i32 414, i32 421, i32 428, i32 435, i32 442>
  %strided.vec2 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219, i32 226, i32 233, i32 240, i32 247, i32 254, i32 261, i32 268, i32 275, i32 282, i32 289, i32 296, i32 303, i32 310, i32 317, i32 324, i32 331, i32 338, i32 345, i32 352, i32 359, i32 366, i32 373, i32 380, i32 387, i32 394, i32 401, i32 408, i32 415, i32 422, i32 429, i32 436, i32 443>
  %strided.vec3 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220, i32 227, i32 234, i32 241, i32 248, i32 255, i32 262, i32 269, i32 276, i32 283, i32 290, i32 297, i32 304, i32 311, i32 318, i32 325, i32 332, i32 339, i32 346, i32 353, i32 360, i32 367, i32 374, i32 381, i32 388, i32 395, i32 402, i32 409, i32 416, i32 423, i32 430, i32 437, i32 444>
  %strided.vec4 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221, i32 228, i32 235, i32 242, i32 249, i32 256, i32 263, i32 270, i32 277, i32 284, i32 291, i32 298, i32 305, i32 312, i32 319, i32 326, i32 333, i32 340, i32 347, i32 354, i32 361, i32 368, i32 375, i32 382, i32 389, i32 396, i32 403, i32 410, i32 417, i32 424, i32 431, i32 438, i32 445>
  %strided.vec5 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222, i32 229, i32 236, i32 243, i32 250, i32 257, i32 264, i32 271, i32 278, i32 285, i32 292, i32 299, i32 306, i32 313, i32 320, i32 327, i32 334, i32 341, i32 348, i32 355, i32 362, i32 369, i32 376, i32 383, i32 390, i32 397, i32 404, i32 411, i32 418, i32 425, i32 432, i32 439, i32 446>
  %strided.vec6 = shufflevector <448 x i32> %wide.vec, <448 x i32> poison, <64 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223, i32 230, i32 237, i32 244, i32 251, i32 258, i32 265, i32 272, i32 279, i32 286, i32 293, i32 300, i32 307, i32 314, i32 321, i32 328, i32 335, i32 342, i32 349, i32 356, i32 363, i32 370, i32 377, i32 384, i32 391, i32 398, i32 405, i32 412, i32 419, i32 426, i32 433, i32 440, i32 447>
  store <64 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <64 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <64 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <64 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <64 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <64 x i32> %strided.vec5, ptr %out.vec5, align 64
  store <64 x i32> %strided.vec6, ptr %out.vec6, align 64
  ret void
}