; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.
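;
; As an illustrative sketch only (not part of the autogenerated assertions),
; a source loop of roughly the following shape, with hypothetical arrays
; `in` and `out0`..`out5`, deinterleaves at stride 6 and is what the
; vectorizer turns into the wide-load-plus-shufflevector IR tested below:
;
;   for (int i = 0; i != n; ++i) {
;     out0[i] = in[6*i + 0];
;     out1[i] = in[6*i + 1];
;     out2[i] = in[6*i + 2];
;     out3[i] = in[6*i + 3];
;     out4[i] = in[6*i + 4];
;     out5[i] = in[6*i + 5];
;   }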

define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i32_stride6_vf2:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa 16(%rdi), %xmm0
; SSE-NEXT:    movdqa 32(%rdi), %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
; SSE-NEXT:    movq %xmm1, (%rsi)
; SSE-NEXT:    movq %xmm4, (%rdx)
; SSE-NEXT:    movq %xmm5, (%rcx)
; SSE-NEXT:    movq %xmm6, (%r8)
; SSE-NEXT:    movq %xmm0, (%r9)
; SSE-NEXT:    movq %xmm7, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_stride6_vf2:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX-NEXT:    vmovaps 32(%rdi), %xmm2
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[2,2,3,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm0[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm2[2,2,3,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; AVX-NEXT:    vmovlps %xmm3, (%rsi)
; AVX-NEXT:    vmovlps %xmm4, (%rdx)
; AVX-NEXT:    vmovlps %xmm5, (%rcx)
; AVX-NEXT:    vmovlps %xmm0, (%r8)
; AVX-NEXT:    vmovlps %xmm6, (%r9)
; AVX-NEXT:    vmovlps %xmm1, (%rax)
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i32_stride6_vf2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovaps 32(%rdi), %ymm0
; AVX2-NEXT:    vmovaps (%rdi), %xmm1
; AVX2-NEXT:    vmovaps 16(%rdi), %xmm2
; AVX2-NEXT:    vmovaps 32(%rdi), %xmm3
; AVX2-NEXT:    vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX2-NEXT:    vmovsd {{.*#+}} xmm3 = [4,2,0,0]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm0, %ymm3, %ymm3
; AVX2-NEXT:    vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX2-NEXT:    vpermps %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vmovlps %xmm4, (%rsi)
; AVX2-NEXT:    vmovlps %xmm2, (%rdx)
; AVX2-NEXT:    vmovlps %xmm5, (%rcx)
; AVX2-NEXT:    vmovlps %xmm1, (%r8)
; AVX2-NEXT:    vmovlps %xmm3, (%r9)
; AVX2-NEXT:    vmovlps %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i32_stride6_vf2:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm1
; AVX2-FP-NEXT:    vmovaps 16(%rdi), %xmm2
; AVX2-FP-NEXT:    vmovaps 32(%rdi), %xmm3
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm3 = [4,2,0,0]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm3, %ymm3
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm6, %ymm0
; AVX2-FP-NEXT:    vmovlps %xmm4, (%rsi)
; AVX2-FP-NEXT:    vmovlps %xmm2, (%rdx)
; AVX2-FP-NEXT:    vmovlps %xmm5, (%rcx)
; AVX2-FP-NEXT:    vmovlps %xmm1, (%r8)
; AVX2-FP-NEXT:    vmovlps %xmm3, (%r9)
; AVX2-FP-NEXT:    vmovlps %xmm0, (%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i32_stride6_vf2:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm1
; AVX2-FCP-NEXT:    vmovaps 16(%rdi), %xmm2
; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %xmm3
; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm3 = [4,2,0,0]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm3, %ymm3
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm6, %ymm0
; AVX2-FCP-NEXT:    vmovlps %xmm4, (%rsi)
; AVX2-FCP-NEXT:    vmovlps %xmm2, (%rdx)
; AVX2-FCP-NEXT:    vmovlps %xmm5, (%rcx)
; AVX2-FCP-NEXT:    vmovlps %xmm1, (%r8)
; AVX2-FCP-NEXT:    vmovlps %xmm3, (%r9)
; AVX2-FCP-NEXT:    vmovlps %xmm0, (%rax)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: load_i32_stride6_vf2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT:    vextractps $2, %xmm1, %r10d
; AVX512-NEXT:    vpinsrd $1, %r10d, %xmm0, %xmm3
; AVX512-NEXT:    vextractps $3, %xmm1, %r10d
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpinsrd $1, %r10d, %xmm1, %xmm1
; AVX512-NEXT:    vpbroadcastd 8(%rdi), %xmm4
; AVX512-NEXT:    vmovd %xmm2, %r10d
; AVX512-NEXT:    vpinsrd $1, %r10d, %xmm4, %xmm4
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
; AVX512-NEXT:    vmovaps 32(%rdi), %ymm5
; AVX512-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512-NEXT:    vpermps %ymm5, %ymm2, %ymm2
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512-NEXT:    vpermps %ymm5, %ymm6, %ymm5
; AVX512-NEXT:    vmovq %xmm3, (%rsi)
; AVX512-NEXT:    vmovq %xmm1, (%rdx)
; AVX512-NEXT:    vmovq %xmm4, (%rcx)
; AVX512-NEXT:    vmovq %xmm0, (%r8)
; AVX512-NEXT:    vmovlps %xmm2, (%r9)
; AVX512-NEXT:    vmovlps %xmm5, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i32_stride6_vf2:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
; AVX512-FCP-NEXT:    vmovaps (%rdi), %ymm1
; AVX512-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm0
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
; AVX512-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm2
; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm5
; AVX512-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
; AVX512-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm6
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
; AVX512-FCP-NEXT:    vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm4
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
; AVX512-FCP-NEXT:    vpermps %ymm1, %ymm5, %ymm1
; AVX512-FCP-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512-FCP-NEXT:    vmovlps %xmm2, (%rdx)
; AVX512-FCP-NEXT:    vmovq %xmm3, (%rcx)
; AVX512-FCP-NEXT:    vmovq %xmm6, (%r8)
; AVX512-FCP-NEXT:    vmovlps %xmm4, (%r9)
; AVX512-FCP-NEXT:    vmovlps %xmm1, (%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i32_stride6_vf2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512DQ-NEXT:    vextractps $2, %xmm1, %r10d
; AVX512DQ-NEXT:    vpinsrd $1, %r10d, %xmm0, %xmm3
; AVX512DQ-NEXT:    vextractps $3, %xmm1, %r10d
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512DQ-NEXT:    vpinsrd $1, %r10d, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastd 8(%rdi), %xmm4
; AVX512DQ-NEXT:    vmovd %xmm2, %r10d
; AVX512DQ-NEXT:    vpinsrd $1, %r10d, %xmm4, %xmm4
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
; AVX512DQ-NEXT:    vmovaps 32(%rdi), %ymm5
; AVX512DQ-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-NEXT:    vpermps %ymm5, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512DQ-NEXT:    vpermps %ymm5, %ymm6, %ymm5
; AVX512DQ-NEXT:    vmovq %xmm3, (%rsi)
; AVX512DQ-NEXT:    vmovq %xmm1, (%rdx)
; AVX512DQ-NEXT:    vmovq %xmm4, (%rcx)
; AVX512DQ-NEXT:    vmovq %xmm0, (%r8)
; AVX512DQ-NEXT:    vmovlps %xmm2, (%r9)
; AVX512DQ-NEXT:    vmovlps %xmm5, (%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride6_vf2:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
; AVX512DQ-FCP-NEXT:    vmovaps (%rdi), %ymm1
; AVX512DQ-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
; AVX512DQ-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm2
; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm5
; AVX512DQ-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm3
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm6
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
; AVX512DQ-FCP-NEXT:    vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm4
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
; AVX512DQ-FCP-NEXT:    vpermps %ymm1, %ymm5, %ymm1
; AVX512DQ-FCP-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovlps %xmm2, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rcx)
; AVX512DQ-FCP-NEXT:    vmovq %xmm6, (%r8)
; AVX512DQ-FCP-NEXT:    vmovlps %xmm4, (%r9)
; AVX512DQ-FCP-NEXT:    vmovlps %xmm1, (%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i32_stride6_vf2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT:    vextractps $2, %xmm1, %r10d
; AVX512BW-NEXT:    vpinsrd $1, %r10d, %xmm0, %xmm3
; AVX512BW-NEXT:    vextractps $3, %xmm1, %r10d
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT:    vpinsrd $1, %r10d, %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastd 8(%rdi), %xmm4
; AVX512BW-NEXT:    vmovd %xmm2, %r10d
; AVX512BW-NEXT:    vpinsrd $1, %r10d, %xmm4, %xmm4
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
; AVX512BW-NEXT:    vmovaps 32(%rdi), %ymm5
; AVX512BW-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512BW-NEXT:    vpermps %ymm5, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512BW-NEXT:    vpermps %ymm5, %ymm6, %ymm5
; AVX512BW-NEXT:    vmovq %xmm3, (%rsi)
; AVX512BW-NEXT:    vmovq %xmm1, (%rdx)
; AVX512BW-NEXT:    vmovq %xmm4, (%rcx)
; AVX512BW-NEXT:    vmovq %xmm0, (%r8)
; AVX512BW-NEXT:    vmovlps %xmm2, (%r9)
; AVX512BW-NEXT:    vmovlps %xmm5, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i32_stride6_vf2:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
; AVX512BW-FCP-NEXT:    vmovaps (%rdi), %ymm1
; AVX512BW-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
; AVX512BW-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm2
; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm5
; AVX512BW-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm3
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm6
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
; AVX512BW-FCP-NEXT:    vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm4
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
; AVX512BW-FCP-NEXT:    vpermps %ymm1, %ymm5, %ymm1
; AVX512BW-FCP-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512BW-FCP-NEXT:    vmovlps %xmm2, (%rdx)
; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%rcx)
; AVX512BW-FCP-NEXT:    vmovq %xmm6, (%r8)
; AVX512BW-FCP-NEXT:    vmovlps %xmm4, (%r9)
; AVX512BW-FCP-NEXT:    vmovlps %xmm1, (%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i32_stride6_vf2:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512DQ-BW-NEXT:    vextractps $2, %xmm1, %r10d
; AVX512DQ-BW-NEXT:    vpinsrd $1, %r10d, %xmm0, %xmm3
; AVX512DQ-BW-NEXT:    vextractps $3, %xmm1, %r10d
; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512DQ-BW-NEXT:    vpinsrd $1, %r10d, %xmm1, %xmm1
; AVX512DQ-BW-NEXT:    vpbroadcastd 8(%rdi), %xmm4
; AVX512DQ-BW-NEXT:    vmovd %xmm2, %r10d
; AVX512DQ-BW-NEXT:    vpinsrd $1, %r10d, %xmm4, %xmm4
; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
; AVX512DQ-BW-NEXT:    vmovaps 32(%rdi), %ymm5
; AVX512DQ-BW-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-NEXT:    vpermps %ymm5, %ymm2, %ymm2
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512DQ-BW-NEXT:    vpermps %ymm5, %ymm6, %ymm5
; AVX512DQ-BW-NEXT:    vmovq %xmm3, (%rsi)
; AVX512DQ-BW-NEXT:    vmovq %xmm1, (%rdx)
; AVX512DQ-BW-NEXT:    vmovq %xmm4, (%rcx)
; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%r8)
; AVX512DQ-BW-NEXT:    vmovlps %xmm2, (%r9)
; AVX512DQ-BW-NEXT:    vmovlps %xmm5, (%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf2:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdi), %ymm1
; AVX512DQ-BW-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm2
; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm5
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm3
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %xmm5, %xmm4, %xmm6
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
; AVX512DQ-BW-FCP-NEXT:    vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm4
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermps %ymm1, %ymm5, %ymm1
; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm2, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm6, (%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm4, (%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm1, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <12 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 0, i32 6>
  %strided.vec1 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 1, i32 7>
  %strided.vec2 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 2, i32 8>
  %strided.vec3 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 3, i32 9>
  %strided.vec4 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 4, i32 10>
  %strided.vec5 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 5, i32 11>
  store <2 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <2 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <2 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <2 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <2 x i32> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i32_stride6_vf4:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movdqa 80(%rdi), %xmm1
; SSE-NEXT:    movdqa 64(%rdi), %xmm0
; SSE-NEXT:    movdqa (%rdi), %xmm6
; SSE-NEXT:    movdqa 16(%rdi), %xmm2
; SSE-NEXT:    movdqa 48(%rdi), %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1]
; SSE-NEXT:    movdqa 32(%rdi), %xmm10
; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm10[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm10[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1]
; SSE-NEXT:    movapd %xmm4, (%rsi)
; SSE-NEXT:    movapd %xmm3, (%rdx)
; SSE-NEXT:    movapd %xmm5, (%rcx)
; SSE-NEXT:    movapd %xmm6, (%r8)
; SSE-NEXT:    movapd %xmm9, (%r9)
; SSE-NEXT:    movapd %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_stride6_vf4:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovaps 32(%rdi), %ymm0
; AVX-NEXT:    vmovaps (%rdi), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm2[0,1],xmm3[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,3]
; AVX-NEXT:    vmovaps 64(%rdi), %xmm5
; AVX-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[2]
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[3,0]
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[1,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,3]
; AVX-NEXT:    vmovaps 80(%rdi), %xmm6
; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[0]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[3,3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[1]
; AVX-NEXT:    vmovaps 32(%rdi), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm1[2,2,3,3]
; AVX-NEXT:    vmovaps 16(%rdi), %xmm8
; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm5[0,1],xmm6[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[0,1,0,2]
; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
; AVX-NEXT:    vmovaps %xmm4, (%rsi)
; AVX-NEXT:    vmovaps %xmm2, (%rdx)
; AVX-NEXT:    vmovaps %xmm3, (%rcx)
; AVX-NEXT:    vmovaps %xmm0, (%r8)
; AVX-NEXT:    vmovaps %xmm7, (%r9)
; AVX-NEXT:    vmovaps %xmm1, (%rax)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i32_stride6_vf4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,4,0]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-NEXT:    vpermd %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0]
; AVX2-NEXT:    vpermd %ymm3, %ymm5, %ymm3
; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm5
; AVX2-NEXT:    vpbroadcastd %xmm5, %xmm6
; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm7[2,0,2,3,6,4,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,3,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
; AVX2-NEXT:    vpbroadcastd 84(%rdi), %xmm8
; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[3,1,3,3,7,5,7,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,3,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm5[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpermd %ymm1, %ymm9, %ymm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3]
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
; AVX2-NEXT:    vpermd %ymm1, %ymm5, %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX2-NEXT:    vmovdqa %xmm6, (%rcx)
; AVX2-NEXT:    vmovdqa %xmm7, (%r8)
; AVX2-NEXT:    vmovdqa %xmm2, (%r9)
; AVX2-NEXT:    vmovdqa %xmm1, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i32_stride6_vf4:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm1
; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,4,0]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0]
; AVX2-FP-NEXT:    vpermd %ymm3, %ymm5, %ymm3
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm5
; AVX2-FP-NEXT:    vpbroadcastd %xmm5, %xmm6
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm7[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,3,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
; AVX2-FP-NEXT:    vpbroadcastd 84(%rdi), %xmm8
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,3,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm5[2,3]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vpermd %ymm1, %ymm9, %ymm2
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3]
; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
; AVX2-FP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
; AVX2-FP-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-FP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX2-FP-NEXT:    vmovdqa %xmm6, (%rcx)
; AVX2-FP-NEXT:    vmovdqa %xmm7, (%r8)
; AVX2-FP-NEXT:    vmovdqa %xmm2, (%r9)
; AVX2-FP-NEXT:    vmovdqa %xmm1, (%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i32_stride6_vf4:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,4,0]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0]
; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm5, %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [2,0,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm5, %ymm5
; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm7
; AVX2-FCP-NEXT:    vpbroadcastd %xmm7, %xmm8
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [3,1,7,7]
; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm8, %ymm6
; AVX2-FCP-NEXT:    vpbroadcastd 84(%rdi), %xmm8
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm9, %ymm2
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,3,0,0]
; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm7, %ymm1
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX2-FCP-NEXT:    vmovdqa %xmm5, (%rcx)
; AVX2-FCP-NEXT:    vmovdqa %xmm6, (%r8)
; AVX2-FCP-NEXT:    vmovdqa %xmm2, (%r9)
; AVX2-FCP-NEXT:    vmovdqa %xmm1, (%rax)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: load_i32_stride6_vf4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: load_i32_stride6_vf4:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: load_i32_stride6_vf4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512DQ-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512DQ-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512DQ-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512DQ-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512DQ-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride6_vf4:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: load_i32_stride6_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512BW-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512BW-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512BW-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512BW-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512BW-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512BW-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: load_i32_stride6_vf4:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512BW-FCP-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: load_i32_stride6_vf4:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512DQ-BW-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512DQ-BW-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512DQ-BW-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf4:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm7, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <24 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <4 x i32> <i32 0, i32 6, i32 12, i32 18>
  %strided.vec1 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <4 x i32> <i32 1, i32 7, i32 13, i32 19>
  %strided.vec2 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <4 x i32> <i32 2, i32 8, i32 14, i32 20>
  %strided.vec3 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <4 x i32> <i32 3, i32 9, i32 15, i32 21>
  %strided.vec4 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <4 x i32> <i32 4, i32 10, i32 16, i32 22>
  %strided.vec5 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <4 x i32> <i32 5, i32 11, i32 17, i32 23>
  store <4 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <4 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <4 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <4 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <4 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <4 x i32> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i32_stride6_vf8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa 144(%rdi), %xmm4
; SSE-NEXT:    movdqa 160(%rdi), %xmm2
; SSE-NEXT:    movdqa 96(%rdi), %xmm6
; SSE-NEXT:    movdqa 112(%rdi), %xmm3
; SSE-NEXT:    movdqa 64(%rdi), %xmm5
; SSE-NEXT:    movdqa (%rdi), %xmm10
; SSE-NEXT:    movdqa 16(%rdi), %xmm11
; SSE-NEXT:    movdqa 48(%rdi), %xmm8
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3]
; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm10[2,3,2,3]
; SSE-NEXT:    movdqa %xmm10, %xmm7
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1]
; SSE-NEXT:    movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm6[1,1,1,1]
; SSE-NEXT:    movdqa %xmm6, %xmm9
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1]
; SSE-NEXT:    movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
; SSE-NEXT:    movdqa %xmm5, %xmm9
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3]
; SSE-NEXT:    movdqa %xmm8, %xmm11
; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
; SSE-NEXT:    movdqa %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT:    movdqa %xmm4, %xmm12
; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
; SSE-NEXT:    movdqa 80(%rdi), %xmm14
; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm15[0],xmm12[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT:    movdqa 32(%rdi), %xmm7
; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1]
; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm13[0],xmm5[1]
; SSE-NEXT:    movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[2,2,3,3]
; SSE-NEXT:    movdqa 176(%rdi), %xmm15
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE-NEXT:    movdqa 128(%rdi), %xmm5
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE-NEXT:    movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1]
; SSE-NEXT:    movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1]
; SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm3, %xmm10
; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm10[0],xmm6[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm5[2,3,2,3]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm2[0,0,1,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%rsi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rsi)
; SSE-NEXT:    movapd %xmm12, 16(%rdx)
; SSE-NEXT:    movapd %xmm11, (%rdx)
; SSE-NEXT:    movapd %xmm13, 16(%rcx)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rcx)
; SSE-NEXT:    movapd %xmm4, 16(%r8)
; SSE-NEXT:    movapd %xmm8, (%r8)
; SSE-NEXT:    movapd %xmm10, 16(%r9)
; SSE-NEXT:    movapd %xmm6, (%r9)
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movapd %xmm2, 16(%rax)
; SSE-NEXT:    movapd %xmm9, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_stride6_vf8:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovapd 160(%rdi), %ymm3
; AVX-NEXT:    vmovapd 128(%rdi), %ymm4
; AVX-NEXT:    vmovaps 32(%rdi), %ymm6
; AVX-NEXT:    vmovaps (%rdi), %ymm7
; AVX-NEXT:    vmovaps 96(%rdi), %ymm0
; AVX-NEXT:    vmovaps 64(%rdi), %ymm1
; AVX-NEXT:    vinsertf128 $1, 96(%rdi), %ymm1, %ymm5
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm5[0,0],ymm0[6,4],ymm5[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm5[2,2],ymm2[6,4],ymm5[6,6]
; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm9
; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm8[0,1],xmm9[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm4[2,3],ymm3[0,1]
; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm10[0],ymm4[1],ymm10[3],ymm4[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm0[3,0],ymm5[1,0],ymm0[7,4],ymm5[5,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm11[2,0],ymm5[2,3],ymm11[6,4],ymm5[6,7]
; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[1,0],xmm9[3,0]
; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[1,3]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[3,1],ymm4[1,3],ymm10[7,5],ymm4[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1]
; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm9[0,0],ymm8[2,0],ymm9[4,4],ymm8[6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm11
; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm7[2,0],xmm11[2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm1[2,0],ymm0[6,5],ymm1[6,4]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5,6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[3,1],ymm9[4,5],ymm8[7,5]
; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,1],xmm11[3,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm0[3,1],ymm1[2,1],ymm0[7,5],ymm1[6,5]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,0,1]
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7]
; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm3[2,0],ymm4[0,0],ymm3[6,4],ymm4[4,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm4[0,2],ymm8[2,0],ymm4[4,6],ymm8[6,4]
; AVX-NEXT:    vmovaps 32(%rdi), %xmm9
; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm9[2,2,3,3]
; AVX-NEXT:    vmovaps 16(%rdi), %xmm11
; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
; AVX-NEXT:    vmovapd 80(%rdi), %xmm12
; AVX-NEXT:    vshufpd {{.*#+}} ymm13 = ymm12[1],ymm1[0],ymm12[2],ymm1[3]
; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[0,1],ymm13[2,0],ymm0[4,5],ymm13[6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm4[1,0],ymm3[7,4],ymm4[5,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[0,3],ymm3[2,0],ymm4[4,7],ymm3[6,4]
; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm9[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm1[1,3],ymm12[7,5],ymm1[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[2,0],ymm0[5,5],ymm1[6,4]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT:    vmovaps %ymm2, (%rsi)
; AVX-NEXT:    vmovaps %ymm5, (%rdx)
; AVX-NEXT:    vmovaps %ymm6, (%rcx)
; AVX-NEXT:    vmovaps %ymm7, (%r8)
; AVX-NEXT:    vmovaps %ymm8, (%r9)
; AVX-NEXT:    vmovaps %ymm0, (%rax)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_i32_stride6_vf8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovaps 128(%rdi), %ymm0
; AVX2-NEXT:    vmovaps 160(%rdi), %ymm1
; AVX2-NEXT:    vmovaps 96(%rdi), %ymm5
; AVX2-NEXT:    vmovaps (%rdi), %ymm2
; AVX2-NEXT:    vmovaps 32(%rdi), %ymm4
; AVX2-NEXT:    vmovaps 64(%rdi), %ymm6
; AVX2-NEXT:    vmovaps {{.*#+}} xmm3 = [0,6,4,u]
; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
; AVX2-NEXT:    vpermps %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm6[0,1],ymm5[0,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,2,2,2,4,6,6,6]
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm8 = [4,2,4,2,4,2,4,2]
; AVX2-NEXT:    vpermps %ymm10, %ymm8, %ymm11
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7]
; AVX2-NEXT:    vmovaps {{.*#+}} xmm11 = [1,7,5,u]
; AVX2-NEXT:    vpermps %ymm7, %ymm11, %ymm7
; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3]
; AVX2-NEXT:    vpermps %ymm10, %ymm9, %ymm10
; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm12 = ymm11[2,0,2,3,6,4,6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,3,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm12[0,0,2,0,4,4,6,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm5[3,3,3,3,7,7,7,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,2,0,3]
; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[3,1,3,3,7,5,7,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,3,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,7,5]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5,6,7]
; AVX2-NEXT:    vmovaps 80(%rdi), %xmm12
; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,1,0,2,4,5,4,6]
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm2, %ymm8, %ymm4
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [0,2,0,6,0,2,0,6]
; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3],ymm4[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm2, %ymm9, %ymm2
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7]
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX2-NEXT:    vpermps %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
; AVX2-NEXT:    vmovaps %ymm3, (%rsi)
; AVX2-NEXT:    vmovaps %ymm7, (%rdx)
; AVX2-NEXT:    vmovaps %ymm10, (%rcx)
; AVX2-NEXT:    vmovaps %ymm11, (%r8)
; AVX2-NEXT:    vmovaps %ymm1, (%r9)
; AVX2-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: load_i32_stride6_vf8:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm1
; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm5
; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm2
; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm4
; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm6
; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm3 = [0,6,4,u]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vpermps %ymm7, %ymm3, %ymm3
; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm6[0,1],ymm5[0,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm8 = [4,2,4,2,4,2,4,2]
; AVX2-FP-NEXT:    vpermps %ymm10, %ymm8, %ymm11
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7]
; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm11 = [1,7,5,u]
; AVX2-FP-NEXT:    vpermps %ymm7, %ymm11, %ymm7
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3]
; AVX2-FP-NEXT:    vpermps %ymm10, %ymm9, %ymm10
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm12 = ymm11[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm12[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm5[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,2,0,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,3,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5,6,7]
; AVX2-FP-NEXT:    vmovaps 80(%rdi), %xmm12
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm8, %ymm4
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [0,2,0,6,0,2,0,6]
; AVX2-FP-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm1, %ymm1
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3],ymm4[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm2, %ymm9, %ymm2
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
; AVX2-FP-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX2-FP-NEXT:    vpermps %ymm0, %ymm4, %ymm0
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FP-NEXT:    vmovaps %ymm3, (%rsi)
; AVX2-FP-NEXT:    vmovaps %ymm7, (%rdx)
; AVX2-FP-NEXT:    vmovaps %ymm10, (%rcx)
; AVX2-FP-NEXT:    vmovaps %ymm11, (%r8)
; AVX2-FP-NEXT:    vmovaps %ymm1, (%r9)
; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: load_i32_stride6_vf8:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm5
; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm4
; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm6
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm3 = [0,6,4,u]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm3, %ymm3
; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm6[0,1],ymm5[0,1]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm8 = [4,2,4,2,4,2,4,2]
; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm8, %ymm11
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm11 = [1,7,5,u]
; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm11, %ymm7
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3]
; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm9, %ymm10
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm11 = [2,4,2,4,2,4,2,4]
; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm11, %ymm10
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm11 = [2,0,6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm11, %ymm11
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm13 = [0,0,6,4,0,0,6,4]
; AVX2-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm13, %ymm13
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm13 = [0,1,7,5,0,1,7,5]
; AVX2-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm13, %ymm11
; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm13 = mem[3,3,3,3]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm13 = [3,1,7,5,0,u,u,u]
; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm13, %ymm12
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7]
; AVX2-FCP-NEXT:    vmovaps 80(%rdi), %xmm12
; AVX2-FCP-NEXT:    vblendps
{{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 1260; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] 1261; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,0,2,4,5,4,6] 1262; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] 1263; AVX2-FCP-NEXT: vpermps %ymm2, %ymm8, %ymm4 1264; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7] 1265; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 1266; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,0,6,0,2,0,6] 1267; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] 1268; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm1 1269; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] 1270; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] 1271; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3],ymm4[4,5,6,7] 1272; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2 1273; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] 1274; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7] 1275; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] 1276; AVX2-FCP-NEXT: vpermps %ymm0, %ymm4, %ymm0 1277; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] 1278; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) 1279; AVX2-FCP-NEXT: vmovaps %ymm7, (%rdx) 1280; AVX2-FCP-NEXT: vmovaps %ymm10, (%rcx) 1281; AVX2-FCP-NEXT: vmovaps %ymm11, (%r8) 1282; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9) 1283; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) 1284; AVX2-FCP-NEXT: vzeroupper 1285; AVX2-FCP-NEXT: retq 1286; 1287; AVX512-LABEL: load_i32_stride6_vf8: 1288; AVX512: # %bb.0: 1289; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 1290; AVX512-NEXT: vmovdqa 128(%rdi), %ymm0 1291; AVX512-NEXT: vmovdqa 160(%rdi), %ymm1 1292; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] 1293; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] 1294; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 1295; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5 1296; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 1297; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1298; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] 1299; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 1300; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] 1301; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1302; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] 1303; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 1304; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] 1305; AVX512-NEXT: vpermd %zmm6, %zmm2, %zmm2 1306; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] 1307; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1308; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] 1309; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] 1310; AVX512-NEXT: vpermd %zmm6, %zmm3, %zmm3 1311; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] 1312; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 1313; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] 1314; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 1315; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] 1316; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1317; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] 1318; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 1319; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] 1320; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1321; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] 1322; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 1323; AVX512-NEXT: vmovdqa %ymm7, 
(%rsi) 1324; AVX512-NEXT: vmovdqa %ymm8, (%rdx) 1325; AVX512-NEXT: vmovdqa %ymm2, (%rcx) 1326; AVX512-NEXT: vmovdqa %ymm3, (%r8) 1327; AVX512-NEXT: vmovdqa %ymm6, (%r9) 1328; AVX512-NEXT: vmovdqa %ymm4, (%rax) 1329; AVX512-NEXT: vzeroupper 1330; AVX512-NEXT: retq 1331; 1332; AVX512-FCP-LABEL: load_i32_stride6_vf8: 1333; AVX512-FCP: # %bb.0: 1334; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1335; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 1336; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 1337; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] 1338; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] 1339; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 1340; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 1341; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 1342; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1343; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] 1344; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 1345; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] 1346; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1347; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] 1348; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 1349; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] 1350; AVX512-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 1351; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] 1352; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1353; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] 1354; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] 1355; AVX512-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 1356; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] 1357; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 1358; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] 1359; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 1360; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] 1361; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1362; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] 1363; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 1364; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] 1365; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1366; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] 1367; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 1368; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rsi) 1369; AVX512-FCP-NEXT: vmovdqa %ymm8, (%rdx) 1370; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rcx) 1371; AVX512-FCP-NEXT: vmovdqa %ymm3, (%r8) 1372; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r9) 1373; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rax) 1374; AVX512-FCP-NEXT: vzeroupper 1375; AVX512-FCP-NEXT: retq 1376; 1377; AVX512DQ-LABEL: load_i32_stride6_vf8: 1378; AVX512DQ: # %bb.0: 1379; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 1380; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm0 1381; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm1 1382; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] 1383; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] 1384; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 1385; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5 1386; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6 1387; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1388; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] 1389; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 1390; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] 1391; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1392; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} 
ymm8 = [0,1,2,3,4,5,13,11] 1393; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 1394; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] 1395; AVX512DQ-NEXT: vpermd %zmm6, %zmm2, %zmm2 1396; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] 1397; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1398; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] 1399; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] 1400; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3 1401; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] 1402; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 1403; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] 1404; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 1405; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] 1406; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1407; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] 1408; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 1409; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] 1410; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1411; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] 1412; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 1413; AVX512DQ-NEXT: vmovdqa %ymm7, (%rsi) 1414; AVX512DQ-NEXT: vmovdqa %ymm8, (%rdx) 1415; AVX512DQ-NEXT: vmovdqa %ymm2, (%rcx) 1416; AVX512DQ-NEXT: vmovdqa %ymm3, (%r8) 1417; AVX512DQ-NEXT: vmovdqa %ymm6, (%r9) 1418; AVX512DQ-NEXT: vmovdqa %ymm4, (%rax) 1419; AVX512DQ-NEXT: vzeroupper 1420; AVX512DQ-NEXT: retq 1421; 1422; AVX512DQ-FCP-LABEL: load_i32_stride6_vf8: 1423; AVX512DQ-FCP: # %bb.0: 1424; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1425; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 1426; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 1427; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] 1428; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] 1429; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 1430; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 1431; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 1432; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1433; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] 1434; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 1435; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] 1436; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1437; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] 1438; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 1439; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] 1440; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 1441; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] 1442; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1443; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] 1444; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] 1445; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 1446; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] 1447; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 1448; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] 1449; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 1450; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] 1451; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1452; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] 1453; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 1454; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm1 = [21,27,1,7,13,0,0,0] 1455; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1456; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] 1457; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 1458; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rsi) 1459; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%rdx) 1460; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rcx) 1461; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%r8) 1462; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r9) 1463; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rax) 1464; AVX512DQ-FCP-NEXT: vzeroupper 1465; AVX512DQ-FCP-NEXT: retq 1466; 1467; AVX512BW-LABEL: load_i32_stride6_vf8: 1468; AVX512BW: # %bb.0: 1469; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1470; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 1471; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm1 1472; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] 1473; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] 1474; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 1475; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 1476; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 1477; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1478; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] 1479; AVX512BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 1480; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] 1481; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1482; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] 1483; AVX512BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 1484; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] 1485; AVX512BW-NEXT: vpermd %zmm6, %zmm2, %zmm2 1486; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] 1487; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1488; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] 1489; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] 1490; AVX512BW-NEXT: vpermd %zmm6, %zmm3, %zmm3 1491; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] 1492; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 1493; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] 1494; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 1495; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] 1496; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1497; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] 1498; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 1499; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] 1500; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1501; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] 1502; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 1503; AVX512BW-NEXT: vmovdqa %ymm7, (%rsi) 1504; AVX512BW-NEXT: vmovdqa %ymm8, (%rdx) 1505; AVX512BW-NEXT: vmovdqa %ymm2, (%rcx) 1506; AVX512BW-NEXT: vmovdqa %ymm3, (%r8) 1507; AVX512BW-NEXT: vmovdqa %ymm6, (%r9) 1508; AVX512BW-NEXT: vmovdqa %ymm4, (%rax) 1509; AVX512BW-NEXT: vzeroupper 1510; AVX512BW-NEXT: retq 1511; 1512; AVX512BW-FCP-LABEL: load_i32_stride6_vf8: 1513; AVX512BW-FCP: # %bb.0: 1514; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1515; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 1516; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 1517; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] 1518; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] 1519; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 1520; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 1521; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 1522; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1523; AVX512BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] 1524; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 1525; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] 1526; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1527; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] 1528; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 1529; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] 1530; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 1531; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] 1532; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1533; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] 1534; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] 1535; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 1536; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] 1537; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 1538; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] 1539; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 1540; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] 1541; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1542; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] 1543; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 1544; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] 1545; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1546; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] 1547; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 1548; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rsi) 1549; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rdx) 1550; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) 1551; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%r8) 1552; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r9) 1553; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rax) 1554; AVX512BW-FCP-NEXT: vzeroupper 1555; AVX512BW-FCP-NEXT: retq 1556; 1557; AVX512DQ-BW-LABEL: load_i32_stride6_vf8: 1558; AVX512DQ-BW: # %bb.0: 1559; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1560; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm0 1561; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm1 1562; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] 1563; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] 1564; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 1565; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5 1566; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6 1567; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1568; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] 1569; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 1570; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] 1571; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1572; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] 1573; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 1574; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] 1575; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm2, %zmm2 1576; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] 1577; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1578; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] 1579; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] 1580; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm3, %zmm3 1581; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] 1582; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 1583; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] 1584; AVX512DQ-BW-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 1585; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] 1586; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1587; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] 1588; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 1589; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] 1590; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1591; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] 1592; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 1593; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%rsi) 1594; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%rdx) 1595; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rcx) 1596; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%r8) 1597; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%r9) 1598; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rax) 1599; AVX512DQ-BW-NEXT: vzeroupper 1600; AVX512DQ-BW-NEXT: retq 1601; 1602; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf8: 1603; AVX512DQ-BW-FCP: # %bb.0: 1604; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1605; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 1606; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 1607; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] 1608; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] 1609; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 1610; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 1611; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 1612; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1613; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] 1614; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 1615; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] 1616; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1617; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] 1618; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 1619; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] 1620; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 1621; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] 1622; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 1623; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] 1624; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] 1625; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 1626; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] 1627; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 1628; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] 1629; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 1630; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] 1631; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1632; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] 1633; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 1634; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] 1635; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 1636; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] 1637; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 1638; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rsi) 1639; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rdx) 1640; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) 1641; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%r8) 1642; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r9) 1643; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rax) 1644; AVX512DQ-BW-FCP-NEXT: vzeroupper 1645; 
AVX512DQ-BW-FCP-NEXT: retq 1646 %wide.vec = load <48 x i32>, ptr %in.vec, align 64 1647 %strided.vec0 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <8 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42> 1648 %strided.vec1 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <8 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43> 1649 %strided.vec2 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <8 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44> 1650 %strided.vec3 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <8 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45> 1651 %strided.vec4 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <8 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46> 1652 %strided.vec5 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <8 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47> 1653 store <8 x i32> %strided.vec0, ptr %out.vec0, align 64 1654 store <8 x i32> %strided.vec1, ptr %out.vec1, align 64 1655 store <8 x i32> %strided.vec2, ptr %out.vec2, align 64 1656 store <8 x i32> %strided.vec3, ptr %out.vec3, align 64 1657 store <8 x i32> %strided.vec4, ptr %out.vec4, align 64 1658 store <8 x i32> %strided.vec5, ptr %out.vec5, align 64 1659 ret void 1660} 1661 1662define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { 1663; SSE-LABEL: load_i32_stride6_vf16: 1664; SSE: # %bb.0: 1665; SSE-NEXT: subq $408, %rsp # imm = 0x198 1666; SSE-NEXT: movdqa 240(%rdi), %xmm9 1667; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1668; SSE-NEXT: movdqa 256(%rdi), %xmm3 1669; SSE-NEXT: movdqa 192(%rdi), %xmm10 1670; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1671; SSE-NEXT: movdqa 208(%rdi), %xmm4 1672; SSE-NEXT: movdqa 336(%rdi), %xmm14 1673; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1674; SSE-NEXT: movdqa 352(%rdi), %xmm5 1675; SSE-NEXT: movdqa 288(%rdi), %xmm15 1676; SSE-NEXT: movdqa 304(%rdi), %xmm7 1677; SSE-NEXT: movdqa 64(%rdi), %xmm12 1678; SSE-NEXT: movdqa (%rdi), %xmm8 1679; SSE-NEXT: movdqa 16(%rdi), %xmm11 1680; SSE-NEXT: movdqa 48(%rdi), %xmm13 1681; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] 1682; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1683; SSE-NEXT: movdqa %xmm8, %xmm1 1684; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1685; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1686; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] 1687; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1688; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] 1689; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1690; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1691; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1692; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] 1693; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1694; SSE-NEXT: movdqa %xmm15, %xmm1 1695; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1696; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] 1697; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1698; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,1,1] 1699; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1700; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1701; 
SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1702; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] 1703; SSE-NEXT: movdqa %xmm4, %xmm14 1704; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1705; SSE-NEXT: movdqa %xmm10, %xmm1 1706; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1707; SSE-NEXT: movdqa %xmm3, %xmm2 1708; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1709; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] 1710; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] 1711; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 1712; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] 1713; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1714; SSE-NEXT: movdqa 96(%rdi), %xmm4 1715; SSE-NEXT: movdqa 112(%rdi), %xmm10 1716; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] 1717; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1718; SSE-NEXT: movdqa %xmm4, %xmm1 1719; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1720; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1721; SSE-NEXT: movdqa 144(%rdi), %xmm9 1722; SSE-NEXT: movdqa 160(%rdi), %xmm6 1723; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] 1724; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1725; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] 1726; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 1727; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] 1728; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1729; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 1730; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] 1731; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1732; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] 1733; SSE-NEXT: movdqa %xmm13, %xmm3 1734; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 1735; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] 1736; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1737; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] 1738; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] 1739; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1740; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] 1741; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 1742; SSE-NEXT: movdqa %xmm5, %xmm3 1743; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 1744; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] 1745; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1746; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 1747; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] 1748; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] 1749; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1750; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] 1751; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 1752; SSE-NEXT: movdqa %xmm3, %xmm2 1753; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1754; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 1755; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1756; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 1757; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] 1758; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1759; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] 1760; SSE-NEXT: movdqa %xmm9, %xmm14 1761; SSE-NEXT: movdqa %xmm9, %xmm2 1762; SSE-NEXT: punpckldq 
{{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1763; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 1764; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1765; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] 1766; SSE-NEXT: movdqa 80(%rdi), %xmm10 1767; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,1,1] 1768; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1769; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 1770; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] 1771; SSE-NEXT: movdqa 32(%rdi), %xmm6 1772; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 1773; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1774; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1775; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] 1776; SSE-NEXT: movdqa 368(%rdi), %xmm0 1777; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1778; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 1779; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1780; SSE-NEXT: movdqa %xmm15, %xmm9 1781; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] 1782; SSE-NEXT: movdqa 320(%rdi), %xmm8 1783; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] 1784; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1785; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1786; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] 1787; SSE-NEXT: movdqa 272(%rdi), %xmm15 1788; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] 1789; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1790; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] 1791; SSE-NEXT: movdqa 224(%rdi), %xmm3 1792; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 1793; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1794; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1795; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] 1796; SSE-NEXT: movdqa 176(%rdi), %xmm11 1797; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,1,1] 1798; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1799; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 1800; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] 1801; SSE-NEXT: movdqa 128(%rdi), %xmm4 1802; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 1803; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1804; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1805; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] 1806; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1807; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] 1808; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1809; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] 1810; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1811; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] 1812; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1813; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1814; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] 1815; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] 1816; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1817; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1818; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 1819; SSE-NEXT: # xmm1 = mem[2,3,2,3] 1820; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1821; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1822; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1823; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1824; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] 1825; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] 1826; SSE-NEXT: movdqa %xmm3, %xmm9 1827; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill 1828; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1829; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 1830; SSE-NEXT: # xmm12 = mem[2,3,2,3] 1831; SSE-NEXT: movdqa %xmm15, %xmm3 1832; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1833; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] 1834; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] 1835; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] 1836; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] 1837; SSE-NEXT: movdqa %xmm4, %xmm7 1838; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1839; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1840; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[2,3,2,3] 1841; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] 1842; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1843; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] 1844; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] 1845; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 1846; SSE-NEXT: movdqa %xmm6, %xmm1 1847; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1848; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] 1849; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 1850; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,0,1,1] 1851; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 1852; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] 1853; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] 1854; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 1855; SSE-NEXT: movdqa %xmm14, %xmm1 1856; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1857; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] 1858; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 1859; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,1,1] 1860; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] 1861; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] 1862; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] 1863; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1864; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1865; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] 1866; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 1867; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] 1868; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 1869; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] 1870; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] 1871; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 1872; SSE-NEXT: movdqa %xmm7, %xmm1 1873; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1874; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] 1875; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 1876; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] 1877; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1878; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1879; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] 1880; SSE-NEXT: 
pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 1881; SSE-NEXT: # xmm1 = mem[3,3,3,3] 1882; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1883; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 1884; SSE-NEXT: # xmm1 = mem[2,3,2,3] 1885; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] 1886; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] 1887; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] 1888; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 1889; SSE-NEXT: # xmm1 = mem[3,3,3,3] 1890; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1891; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 1892; SSE-NEXT: # xmm1 = mem[2,3,2,3] 1893; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] 1894; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] 1895; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1896; SSE-NEXT: # xmm0 = mem[1,1,1,1] 1897; SSE-NEXT: pshufd $255, (%rsp), %xmm1 # 16-byte Folded Reload 1898; SSE-NEXT: # xmm1 = mem[3,3,3,3] 1899; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1900; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 1901; SSE-NEXT: # xmm1 = mem[2,3,2,3] 1902; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] 1903; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] 1904; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] 1905; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 1906; SSE-NEXT: # xmm1 = mem[3,3,3,3] 1907; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1908; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 1909; SSE-NEXT: # xmm1 = mem[2,3,2,3] 1910; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] 1911; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] 1912; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1913; SSE-NEXT: movaps %xmm0, 16(%rsi) 1914; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1915; SSE-NEXT: movaps %xmm0, 32(%rsi) 1916; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1917; SSE-NEXT: movaps %xmm0, 48(%rsi) 1918; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1919; SSE-NEXT: movaps %xmm0, (%rsi) 1920; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1921; SSE-NEXT: movaps %xmm0, 16(%rdx) 1922; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1923; SSE-NEXT: movaps %xmm0, 32(%rdx) 1924; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1925; SSE-NEXT: movaps %xmm0, 48(%rdx) 1926; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1927; SSE-NEXT: movaps %xmm0, (%rdx) 1928; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1929; SSE-NEXT: movaps %xmm0, 16(%rcx) 1930; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1931; SSE-NEXT: movaps %xmm0, 32(%rcx) 1932; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1933; SSE-NEXT: movaps %xmm0, 48(%rcx) 1934; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1935; SSE-NEXT: movaps %xmm0, (%rcx) 1936; SSE-NEXT: movapd %xmm15, 16(%r8) 1937; SSE-NEXT: movapd %xmm12, 32(%r8) 1938; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1939; SSE-NEXT: movaps %xmm0, 48(%r8) 1940; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1941; SSE-NEXT: movaps %xmm0, (%r8) 1942; SSE-NEXT: movapd %xmm2, 16(%r9) 1943; SSE-NEXT: movapd %xmm3, 32(%r9) 1944; SSE-NEXT: movapd %xmm4, 48(%r9) 1945; SSE-NEXT: movapd %xmm5, (%r9) 1946; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1947; SSE-NEXT: movapd %xmm13, 16(%rax) 1948; SSE-NEXT: movapd %xmm9, 32(%rax) 1949; SSE-NEXT: movapd %xmm8, 48(%rax) 1950; SSE-NEXT: movapd %xmm10, (%rax) 1951; SSE-NEXT: addq $408, %rsp # imm = 0x198 1952; SSE-NEXT: retq 1953; 1954; AVX-LABEL: load_i32_stride6_vf16: 1955; AVX: # %bb.0: 1956; AVX-NEXT: subq $328, %rsp # imm = 0x148 1957; AVX-NEXT: vmovaps 224(%rdi), %ymm12 1958; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1959; AVX-NEXT: vmovaps 288(%rdi), %ymm10 1960; AVX-NEXT: vmovaps 256(%rdi), %ymm4 1961; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1962; AVX-NEXT: vmovapd 160(%rdi), %ymm1 1963; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1964; AVX-NEXT: vmovapd 128(%rdi), %ymm13 1965; AVX-NEXT: vmovaps 32(%rdi), %ymm6 1966; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1967; AVX-NEXT: vmovaps (%rdi), %ymm14 1968; AVX-NEXT: vmovaps 96(%rdi), %ymm9 1969; AVX-NEXT: vmovaps 64(%rdi), %ymm0 1970; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1971; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm2 1972; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm9[2,0],ymm2[0,0],ymm9[6,4],ymm2[4,4] 1973; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm2[2,2],ymm5[6,4],ymm2[6,6] 1974; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5],ymm14[6,7] 1975; AVX-NEXT: vextractf128 $1, %ymm6, %xmm7 1976; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm6[0,1],xmm7[2,3] 1977; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm7[0,3] 1978; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7] 1979; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm1[0,1] 1980; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm8[0],ymm13[1],ymm8[3],ymm13[2] 1981; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] 1982; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm11[6,7] 1983; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1984; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm1 1985; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm10[2,0],ymm1[0,0],ymm10[6,4],ymm1[4,4] 1986; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm1[2,2],ymm5[6,4],ymm1[6,6] 1987; AVX-NEXT: vmovaps 192(%rdi), %ymm15 1988; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm12[4,5],ymm15[6,7] 1989; AVX-NEXT: vextractf128 $1, %ymm3, %xmm0 1990; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm3[0,1],xmm0[2,3] 1991; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,3] 1992; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7] 1993; AVX-NEXT: vmovapd 352(%rdi), %ymm4 1994; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1995; AVX-NEXT: vmovapd 320(%rdi), %ymm12 1996; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[0,1] 1997; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm4[0],ymm12[1],ymm4[3],ymm12[2] 1998; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] 1999; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] 2000; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2001; AVX-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill 2002; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm9[3,0],ymm2[1,0],ymm9[7,4],ymm2[5,4] 2003; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm11[2,0],ymm2[2,3],ymm11[6,4],ymm2[6,7] 2004; AVX-NEXT: vshufps {{.*#+}} xmm6 = 
xmm6[1,0],xmm7[3,0] 2005; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[1,3] 2006; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7] 2007; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm13[1,3],ymm8[7,5],ymm13[5,7] 2008; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] 2009; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] 2010; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2011; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2012; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,0],ymm1[1,0],ymm10[7,4],ymm1[5,4] 2013; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7] 2014; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,0],xmm0[3,0] 2015; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] 2016; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 2017; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1],ymm12[1,3],ymm4[7,5],ymm12[5,7] 2018; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 2019; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 2020; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2021; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload 2022; AVX-NEXT: # ymm2 = mem[0,1],ymm14[2,3],mem[4,5,6,7] 2023; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 2024; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[2,1],ymm7[2,0],ymm9[6,5],ymm7[6,4] 2025; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] 2026; AVX-NEXT: vextractf128 $1, %ymm2, %xmm5 2027; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm5[2,3] 2028; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] 2029; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 2030; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm9[4,5],ymm13[6,7] 2031; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3,0,1] 2032; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0],ymm4[2,0],ymm8[4,4],ymm4[6,4] 2033; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm3[5,6,7] 2034; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2035; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload 2036; AVX-NEXT: # ymm11 = mem[0,1],ymm15[2,3],mem[4,5,6,7] 2037; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 2038; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,1],ymm3[2,0],ymm10[6,5],ymm3[6,4] 2039; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] 2040; AVX-NEXT: vextractf128 $1, %ymm11, %xmm15 2041; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm11[2,0],xmm15[2,3] 2042; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7] 2043; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 2044; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] 2045; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] 2046; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm14[2,0],ymm0[4,4],ymm14[6,4] 2047; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] 2048; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2049; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[3,1],ymm8[4,5],ymm4[7,5] 2050; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1],xmm5[3,3] 2051; AVX-NEXT: vmovaps %ymm7, %ymm1 2052; AVX-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload 2053; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm7[3,1],ymm1[2,1],ymm7[7,5],ymm1[6,5] 2054; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] 2055; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] 2056; AVX-NEXT: vblendps 
{{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm14[3,1],ymm0[4,5],ymm14[7,5]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm11[3,1],xmm15[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5]
; AVX-NEXT: vmovaps %ymm3, %ymm15
; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5,6,7]
; AVX-NEXT: vmovaps 32(%rdi), %xmm3
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,2,3,3]
; AVX-NEXT: vmovaps 16(%rdi), %xmm5
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
; AVX-NEXT: vmovapd 80(%rdi), %xmm6
; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm6[1],ymm1[0],ymm6[2],ymm1[3]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,1],ymm8[2,0],ymm7[4,5],ymm8[6,4]
; AVX-NEXT: vmovaps %ymm7, %ymm13
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm9[2,0],ymm8[0,0],ymm9[6,4],ymm8[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,2],ymm11[2,0],ymm8[4,6],ymm11[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7]
; AVX-NEXT: vmovaps 224(%rdi), %xmm12
; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,3,3]
; AVX-NEXT: vmovaps 208(%rdi), %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0],xmm14[1],xmm0[2,3]
; AVX-NEXT: vmovapd 272(%rdi), %xmm1
; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm1[1],ymm15[0],ymm1[2],ymm15[3]
; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm7[2,0],ymm4[4,5],ymm7[6,4]
; AVX-NEXT: vmovaps %ymm4, %ymm10
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,0],ymm14[0,0],ymm11[6,4],ymm14[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm4[2,0],ymm14[4,6],ymm4[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,0],ymm8[1,0],ymm9[7,4],ymm8[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,3],ymm7[2,0],ymm8[4,7],ymm7[6,4]
; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload
; AVX-NEXT: # ymm5 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm13[1,1],ymm5[2,0],ymm13[5,5],ymm5[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,0],ymm14[1,0],ymm11[7,4],ymm14[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,3],ymm5[2,0],ymm14[4,7],ymm5[6,4]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm15[1,3],ymm1[7,5],ymm15[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm1[2,0],ymm10[5,5],ymm1[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 32(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, (%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, (%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, (%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 32(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, (%r8)
; AVX-NEXT: vmovaps %ymm4, 32(%r9)
; AVX-NEXT: vmovaps %ymm2, (%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps %ymm0, 32(%rax)
; AVX-NEXT: vmovaps %ymm3, (%rax)
; AVX-NEXT: addq $328, %rsp # imm = 0x148
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i32_stride6_vf16:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $392, %rsp # imm = 0x188
; AVX2-NEXT: vmovaps 288(%rdi), %ymm10
; AVX2-NEXT: vmovaps 224(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 192(%rdi), %ymm2
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 128(%rdi), %ymm0
; AVX2-NEXT: vmovaps 160(%rdi), %ymm3
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 96(%rdi), %ymm15
; AVX2-NEXT: vmovaps (%rdi), %ymm4
; AVX2-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill
; AVX2-NEXT: vmovaps 32(%rdi), %ymm5
; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 64(%rdi), %ymm13
; AVX2-NEXT: vmovaps {{.*#+}} xmm6 = [0,6,4,u]
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
; AVX2-NEXT: vpermps %ymm8, %ymm6, %ymm7
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm13[0,1],ymm15[0,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovaps %ymm0, %ymm7
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm12 = [4,2,4,2,4,2,4,2]
; AVX2-NEXT: vpermps %ymm4, %ymm12, %ymm14
; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm14[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX2-NEXT: vpermps %ymm3, %ymm6, %ymm0
; AVX2-NEXT: vmovaps 256(%rdi), %ymm11
; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm10[0,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm10[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm14[3,4,5,6,7]
; AVX2-NEXT: vmovaps 320(%rdi), %ymm5
; AVX2-NEXT: vmovaps 352(%rdi), %ymm6
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm0, %ymm12, %ymm10
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [1,7,5,u]
; AVX2-NEXT: vpermps %ymm8, %ymm2, %ymm8
; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm14 = [5,3,5,3,5,3,5,3]
; AVX2-NEXT: vpermps %ymm4, %ymm14, %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
; AVX2-NEXT: vpermps %ymm0, %ymm14, %ymm0
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,0,2,3,6,4,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0,2,0,4,4,6,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm15[3,3,3,3,7,7,7,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1,3,3,7,5,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,1,4,5,7,5]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7]
; AVX2-NEXT: vmovaps %ymm7, %ymm6
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1,3,3,7,5,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,7,5]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm15[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm9[4,5,6,7]
; AVX2-NEXT: vmovaps 80(%rdi), %xmm4
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT: vpermps %ymm3, %ymm7, %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
; AVX2-NEXT: # ymm5 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7]
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6]
; AVX2-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm5, %ymm10, %ymm13
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
; AVX2-NEXT: # ymm11 = ymm9[0,1,2,3],mem[4,5,6,7]
; AVX2-NEXT: vmovaps 272(%rdi), %xmm13
; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,0,2,4,5,4,6]
; AVX2-NEXT: vpermps %ymm11, %ymm7, %ymm7
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload
; AVX2-NEXT: # ymm12 = mem[0,1],ymm9[2,3],mem[4,5,6,7]
; AVX2-NEXT: vpermps %ymm12, %ymm10, %ymm10
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
; AVX2-NEXT: vpermps %ymm3, %ymm14, %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm5, %ymm3, %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5,6,7]
; AVX2-NEXT: vpermps %ymm11, %ymm14, %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
; AVX2-NEXT: vpermps %ymm12, %ymm3, %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, (%rsi)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 32(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, (%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 32(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, (%rcx)
; AVX2-NEXT: vmovaps %ymm8, 32(%r8)
; AVX2-NEXT: vmovaps %ymm0, (%r8)
; AVX2-NEXT: vmovaps %ymm7, 32(%r9)
; AVX2-NEXT: vmovaps %ymm1, (%r9)
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vmovaps %ymm3, 32(%rax)
; AVX2-NEXT: vmovaps %ymm2, (%rax)
; AVX2-NEXT: addq $392, %rsp # imm = 0x188
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i32_stride6_vf16:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: subq $392, %rsp # imm = 0x188
; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm10
; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm2
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm3
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm15
; AVX2-FP-NEXT: vmovaps (%rdi), %ymm4
; AVX2-FP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm5
; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm13
; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm6 = [0,6,4,u]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
; AVX2-FP-NEXT: vpermps %ymm8, %ymm6, %ymm7
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm13[0,1],ymm15[0,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovaps %ymm0, %ymm7
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm12 = [4,2,4,2,4,2,4,2]
; AVX2-FP-NEXT: vpermps %ymm4, %ymm12, %ymm14
; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vpermps %ymm3, %ymm6, %ymm0
; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm11
; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm10[0,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm14[3,4,5,6,7]
; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm5
; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm6
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm0, %ymm12, %ymm10
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm2 = [1,7,5,u]
; AVX2-FP-NEXT: vpermps %ymm8, %ymm2, %ymm8
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm14 = [5,3,5,3,5,3,5,3]
; AVX2-FP-NEXT: vpermps %ymm4, %ymm14, %ymm4
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm2
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm0, %ymm14, %ymm0
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm15[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vmovaps %ymm7, %ymm6
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm4
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT: vpermps %ymm3, %ymm7, %ymm5
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm5 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7]
; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6]
; AVX2-FP-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-FP-NEXT: vpermps %ymm5, %ymm10, %ymm13
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm11 = ymm9[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm13
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vpermps %ymm11, %ymm7, %ymm7
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm12 = mem[0,1],ymm9[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm12, %ymm10, %ymm10
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm3, %ymm14, %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FP-NEXT: vpermps %ymm5, %ymm3, %ymm4
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm11, %ymm14, %ymm5
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm12, %ymm3, %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx)
; AVX2-FP-NEXT: vmovaps %ymm8, 32(%r8)
; AVX2-FP-NEXT: vmovaps %ymm0, (%r8)
; AVX2-FP-NEXT: vmovaps %ymm7, 32(%r9)
; AVX2-FP-NEXT: vmovaps %ymm1, (%r9)
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rax)
; AVX2-FP-NEXT: vmovaps %ymm2, (%rax)
; AVX2-FP-NEXT: addq $392, %rsp # imm = 0x188
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i32_stride6_vf16:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $360, %rsp # imm = 0x168
; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm10
; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm13
; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm4
; AVX2-FCP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm5
; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm15
; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm12 = [0,6,4,u]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
; AVX2-FCP-NEXT: vpermps %ymm8, %ymm12, %ymm7
; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm5
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm15[0,1],ymm1[0,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4,2,4,2,4,2,4,2]
; AVX2-FCP-NEXT: vpermps %ymm4, %ymm1, %ymm14
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpermps %ymm3, %ymm12, %ymm12
; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm11
; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm10[0,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm6[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2],ymm14[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm7
; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm12
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm10
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,7,5,u]
; AVX2-FCP-NEXT: vpermps %ymm8, %ymm2, %ymm8
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm14 = [5,3,5,3,5,3,5,3]
; AVX2-FCP-NEXT: vpermps %ymm4, %ymm14, %ymm4
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm2
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm14, %ymm0
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm5[2,3],ymm15[4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [2,0,6,4,2,0,6,7]
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [2,0,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm4
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm13[4,5],mem[6,7]
; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [0,0,6,4,0,0,6,4]
; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpermps %ymm4, %ymm8, %ymm10
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7]
; AVX2-FCP-NEXT: vpermps %ymm2, %ymm8, %ymm8
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3,4],ymm0[5],ymm3[6,7]
; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,1,7,5,0,1,7,5]
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm4
; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm8 = [3,1,7,5,0,u,u,u]
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm8, %ymm0
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm4 = mem[3,3,3,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4],ymm4[5],ymm1[6,7]
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm8, %ymm1
; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = ymm15[0,1,2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm4
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FCP-NEXT: vpermps %ymm3, %ymm7, %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm5 = ymm13[0,1],mem[2,3],ymm13[4,5,6,7]
; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6]
; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpermps %ymm5, %ymm10, %ymm13
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm11 = ymm9[0,1,2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm13
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,0,2,4,5,4,6]
; AVX2-FCP-NEXT: vpermps %ymm11, %ymm7, %ymm7
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm12 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm12, %ymm10, %ymm10
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm3, %ymm14, %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpermps %ymm5, %ymm3, %ymm4
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm11, %ymm14, %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm12, %ymm3, %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx)
; AVX2-FCP-NEXT: vmovaps %ymm8, 32(%r8)
; AVX2-FCP-NEXT: vmovaps %ymm0, (%r8)
; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%r9)
; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax)
; AVX2-FCP-NEXT: addq $360, %rsp # imm = 0x168
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i32_stride6_vf16:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm0
; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm1
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5
; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm6
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm5, %zmm6, %zmm7
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm2
; AVX512-NEXT: movb $56, %dil
; AVX512-NEXT: kmovw %edi, %k2
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2}
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm7
; AVX512-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1}
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm5, %zmm6, %zmm8
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm7
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2}
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1}
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
; AVX512-NEXT: movw $31, %di
; AVX512-NEXT: kmovw %edi, %k2
; AVX512-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1}
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm10
; AVX512-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2}
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1}
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
; AVX512-NEXT: vpermi2d %zmm3, %zmm4, %zmm11
; AVX512-NEXT: movw $992, %di # imm = 0x3E0
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1}
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512-NEXT: movb $-32, %dil
; AVX512-NEXT: kmovw %edi, %k2
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2}
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
; AVX512-NEXT: vpermi2d %zmm3, %zmm4, %zmm5
; AVX512-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1}
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2}
; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi)
; AVX512-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512-NEXT: vmovdqa64 %zmm9, (%rcx)
; AVX512-NEXT: vmovdqa64 %zmm10, (%r8)
; AVX512-NEXT: vmovdqa64 %zmm11, (%r9)
; AVX512-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i32_stride6_vf16:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2
; AVX512-FCP-NEXT: movb $56, %dil
; AVX512-FCP-NEXT: kmovw %edi, %k2
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2}
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7
; AVX512-FCP-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512-FCP-NEXT: kmovw %edi, %k1
; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1}
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2}
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
; AVX512-FCP-NEXT: movw $31, %di
; AVX512-FCP-NEXT: kmovw %edi, %k2
; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10
; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2}
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1}
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11
; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0
; AVX512-FCP-NEXT: kmovw %edi, %k1
; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1}
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512-FCP-NEXT: movb $-32, %dil
; AVX512-FCP-NEXT: kmovw %edi, %k2
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2}
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5
; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1}
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2}
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%r8)
; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%r9)
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i32_stride6_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm0
; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm1
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5
; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm6
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm6, %zmm7
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm2
; AVX512DQ-NEXT: movb $56, %dil
; AVX512DQ-NEXT: kmovw %edi, %k2
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2}
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm7
; AVX512DQ-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1}
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm6, %zmm8
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm7
; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2}
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1}
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
; AVX512DQ-NEXT: movw $31, %di
; AVX512DQ-NEXT: kmovw %edi, %k2
; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1}
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm10
; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2}
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1}
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm4, %zmm11
; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1}
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-NEXT: movb $-32, %dil
; AVX512DQ-NEXT: kmovw %edi, %k2
; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2}
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm4, %zmm5
; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1}
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2}
; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%r8)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride6_vf16:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2
; AVX512DQ-FCP-NEXT: movb $56, %dil
; AVX512DQ-FCP-NEXT: kmovw %edi, %k2
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7
; AVX512DQ-FCP-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512DQ-FCP-NEXT: kmovw %edi, %k1
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
; AVX512DQ-FCP-NEXT: movw $31, %di
; AVX512DQ-FCP-NEXT: kmovw %edi, %k2
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11
; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0
; AVX512DQ-FCP-NEXT: kmovw %edi, %k1
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-FCP-NEXT: movb $-32, %dil
; AVX512DQ-FCP-NEXT: kmovw %edi, %k2
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%r9)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i32_stride6_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5
; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm7
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm2
; AVX512BW-NEXT: movb $56, %dil
; AVX512BW-NEXT: kmovd %edi, %k2
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7
; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm8
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
; AVX512BW-NEXT: movw $31, %di
; AVX512BW-NEXT: kmovd %edi, %k2
; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm10
; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm11
; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512BW-NEXT: movb $-32, %dil
; AVX512BW-NEXT: kmovd %edi, %k2
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm5
; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2}
; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8)
; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r9)
; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i32_stride6_vf16:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2
; AVX512BW-FCP-NEXT: movb $56, %dil
; AVX512BW-FCP-NEXT: kmovd %edi, %k2
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7
; AVX512BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512BW-FCP-NEXT: kmovd %edi, %k1
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
; AVX512BW-FCP-NEXT: movw $31, %di
; AVX512BW-FCP-NEXT: kmovd %edi, %k2
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11
; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0
; AVX512BW-FCP-NEXT: kmovd %edi, %k1
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512BW-FCP-NEXT: movb $-32, %dil
; AVX512BW-FCP-NEXT: kmovd %edi, %k2
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0]
; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2}
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i32_stride6_vf16:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm6
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm7
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0]
; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm2
; AVX512DQ-BW-NEXT: movb $56, %dil
; AVX512DQ-BW-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7
; AVX512DQ-BW-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512DQ-BW-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm8
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0]
; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
; AVX512DQ-BW-NEXT: movw $31, %di
; AVX512DQ-BW-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1}
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm10
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0]
; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm11
; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0
; AVX512DQ-BW-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-BW-NEXT: movb $-32, %dil
; AVX512DQ-BW-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8,
%zmm11 {%k2} 3255; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] 3256; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3257; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 3258; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] 3259; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 3260; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} 3261; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] 3262; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 3263; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 3264; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} 3265; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) 3266; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rdx) 3267; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rcx) 3268; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%r8) 3269; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r9) 3270; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) 3271; AVX512DQ-BW-NEXT: vzeroupper 3272; AVX512DQ-BW-NEXT: retq 3273; 3274; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf16: 3275; AVX512DQ-BW-FCP: # %bb.0: 3276; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3277; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 3278; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 3279; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 3280; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 3281; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 3282; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6 3283; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] 3284; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 3285; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 3286; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] 3287; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 3288; AVX512DQ-BW-FCP-NEXT: movb $56, %dil 3289; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 3290; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} 3291; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] 3292; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 3293; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 3294; AVX512DQ-BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800 3295; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 3296; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} 3297; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] 3298; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3299; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 3300; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] 3301; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 3302; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} 3303; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] 3304; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3305; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 3306; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} 3307; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] 3308; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 3309; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] 3310; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 3311; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 3312; AVX512DQ-BW-FCP-NEXT: movw $31, %di 3313; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 3314; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} 3315; AVX512DQ-BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] 3316; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3317; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 3318; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} 3319; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] 3320; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 3321; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] 3322; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 3323; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 3324; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} 3325; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] 3326; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3327; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 3328; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1} 3329; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] 3330; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3331; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 3332; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] 3333; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 3334; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 3335; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 3336; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1} 3337; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] 3338; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3339; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 3340; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil 3341; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 3342; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2} 3343; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] 3344; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 3345; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 3346; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] 3347; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 3348; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} 3349; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] 3350; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 3351; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 3352; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} 3353; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) 3354; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) 3355; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) 3356; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8) 3357; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) 3358; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) 3359; AVX512DQ-BW-FCP-NEXT: vzeroupper 3360; AVX512DQ-BW-FCP-NEXT: retq 3361 %wide.vec = load <96 x i32>, ptr %in.vec, align 64 3362 %strided.vec0 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <16 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90> 3363 %strided.vec1 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <16 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91> 3364 %strided.vec2 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <16 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 
92> 3365 %strided.vec3 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <16 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93> 3366 %strided.vec4 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <16 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94> 3367 %strided.vec5 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <16 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95> 3368 store <16 x i32> %strided.vec0, ptr %out.vec0, align 64 3369 store <16 x i32> %strided.vec1, ptr %out.vec1, align 64 3370 store <16 x i32> %strided.vec2, ptr %out.vec2, align 64 3371 store <16 x i32> %strided.vec3, ptr %out.vec3, align 64 3372 store <16 x i32> %strided.vec4, ptr %out.vec4, align 64 3373 store <16 x i32> %strided.vec5, ptr %out.vec5, align 64 3374 ret void 3375} 3376 3377define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { 3378; SSE-LABEL: load_i32_stride6_vf32: 3379; SSE: # %bb.0: 3380; SSE-NEXT: subq $1032, %rsp # imm = 0x408 3381; SSE-NEXT: movdqa 64(%rdi), %xmm5 3382; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3383; SSE-NEXT: movdqa (%rdi), %xmm12 3384; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3385; SSE-NEXT: movdqa 16(%rdi), %xmm13 3386; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3387; SSE-NEXT: movdqa 48(%rdi), %xmm9 3388; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3389; SSE-NEXT: movdqa 528(%rdi), %xmm7 3390; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3391; SSE-NEXT: movdqa 544(%rdi), %xmm3 3392; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3393; SSE-NEXT: movdqa 480(%rdi), %xmm8 3394; SSE-NEXT: movdqa 496(%rdi), %xmm4 3395; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3396; SSE-NEXT: movdqa 144(%rdi), %xmm10 3397; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3398; SSE-NEXT: movdqa 160(%rdi), %xmm2 3399; SSE-NEXT: movdqa 96(%rdi), %xmm1 3400; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3401; SSE-NEXT: movdqa 112(%rdi), %xmm11 3402; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] 3403; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3404; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3405; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] 3406; SSE-NEXT: movdqa %xmm2, %xmm6 3407; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3408; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] 3409; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3410; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 3411; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3412; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] 3413; SSE-NEXT: movdqa %xmm8, %xmm1 3414; SSE-NEXT: movdqa %xmm8, %xmm4 3415; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3416; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3417; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] 3418; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] 3419; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3420; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 3421; SSE-NEXT: 
movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3422; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] 3423; SSE-NEXT: movdqa %xmm12, %xmm1 3424; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3425; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] 3426; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1] 3427; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3428; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 3429; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3430; SSE-NEXT: movdqa 384(%rdi), %xmm1 3431; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3432; SSE-NEXT: movdqa 400(%rdi), %xmm14 3433; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] 3434; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3435; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3436; SSE-NEXT: movdqa 432(%rdi), %xmm3 3437; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill 3438; SSE-NEXT: movdqa 448(%rdi), %xmm9 3439; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] 3440; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3441; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] 3442; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3443; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 3444; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3445; SSE-NEXT: movdqa 288(%rdi), %xmm1 3446; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3447; SSE-NEXT: movdqa 304(%rdi), %xmm15 3448; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] 3449; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3450; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3451; SSE-NEXT: movdqa 336(%rdi), %xmm3 3452; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3453; SSE-NEXT: movdqa 352(%rdi), %xmm12 3454; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] 3455; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3456; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] 3457; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3458; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 3459; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3460; SSE-NEXT: movdqa 672(%rdi), %xmm1 3461; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3462; SSE-NEXT: movdqa 688(%rdi), %xmm8 3463; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] 3464; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3465; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3466; SSE-NEXT: movdqa 720(%rdi), %xmm3 3467; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3468; SSE-NEXT: movdqa 736(%rdi), %xmm5 3469; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] 3470; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3471; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] 3472; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3473; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 3474; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3475; SSE-NEXT: movdqa 192(%rdi), %xmm1 3476; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3477; SSE-NEXT: movdqa 208(%rdi), %xmm0 3478; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3479; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 3480; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3481; SSE-NEXT: movdqa 240(%rdi), %xmm2 3482; SSE-NEXT: 
movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3483; SSE-NEXT: movdqa 256(%rdi), %xmm0 3484; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3485; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 3486; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] 3487; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3488; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 3489; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3490; SSE-NEXT: movdqa 576(%rdi), %xmm7 3491; SSE-NEXT: movdqa 592(%rdi), %xmm13 3492; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] 3493; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3494; SSE-NEXT: movdqa %xmm7, %xmm1 3495; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3496; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3497; SSE-NEXT: movdqa 624(%rdi), %xmm10 3498; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3499; SSE-NEXT: movdqa 640(%rdi), %xmm3 3500; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] 3501; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3502; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] 3503; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3504; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 3505; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3506; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3507; SSE-NEXT: # xmm0 = mem[1,1,1,1] 3508; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] 3509; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3510; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] 3511; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3512; SSE-NEXT: movdqa %xmm10, %xmm2 3513; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 3514; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 3515; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3516; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 3517; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3518; SSE-NEXT: # xmm1 = mem[3,3,3,3] 3519; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3520; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3521; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3522; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 3523; SSE-NEXT: movdqa %xmm11, %xmm2 3524; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 3525; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 3526; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3527; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3528; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] 3529; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3530; SSE-NEXT: # xmm1 = mem[3,3,3,3] 3531; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3532; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3533; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3534; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3535; SSE-NEXT: movdqa %xmm4, %xmm2 3536; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 3537; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 3538; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3539; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3540; SSE-NEXT: # xmm0 = 
mem[1,1,1,1] 3541; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] 3542; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3543; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] 3544; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload 3545; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 3546; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 3547; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3548; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 3549; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] 3550; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[3,3,3,3] 3551; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3552; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] 3553; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 3554; SSE-NEXT: movdqa %xmm15, %xmm2 3555; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 3556; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 3557; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3558; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3559; SSE-NEXT: # xmm0 = mem[1,1,1,1] 3560; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] 3561; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3562; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] 3563; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 3564; SSE-NEXT: movdqa %xmm8, %xmm2 3565; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 3566; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 3567; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3568; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 3569; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] 3570; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3571; SSE-NEXT: # xmm1 = mem[3,3,3,3] 3572; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3573; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3574; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3575; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3576; SSE-NEXT: movdqa %xmm14, %xmm5 3577; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] 3578; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] 3579; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3580; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] 3581; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] 3582; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3583; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] 3584; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 3585; SSE-NEXT: movdqa %xmm5, %xmm3 3586; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 3587; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] 3588; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3589; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] 3590; SSE-NEXT: movdqa 176(%rdi), %xmm0 3591; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3592; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 3593; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3594; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3595; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] 3596; SSE-NEXT: movdqa 128(%rdi), %xmm2 3597; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3598; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] 3599; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3600; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3601; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] 3602; SSE-NEXT: movdqa 80(%rdi), %xmm0 3603; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3604; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 3605; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3606; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] 3607; SSE-NEXT: movdqa 32(%rdi), %xmm13 3608; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] 3609; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3610; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3611; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] 3612; SSE-NEXT: movdqa 368(%rdi), %xmm0 3613; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3614; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 3615; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3616; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] 3617; SSE-NEXT: movdqa 320(%rdi), %xmm2 3618; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3619; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 3620; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3621; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3622; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] 3623; SSE-NEXT: movdqa 272(%rdi), %xmm0 3624; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3625; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 3626; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3627; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] 3628; SSE-NEXT: movdqa %xmm12, %xmm14 3629; SSE-NEXT: movdqa 224(%rdi), %xmm6 3630; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 3631; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3632; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3633; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3634; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] 3635; SSE-NEXT: movdqa 560(%rdi), %xmm0 3636; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3637; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 3638; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3639; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3640; SSE-NEXT: # xmm0 = mem[2,3,2,3] 3641; SSE-NEXT: movdqa 512(%rdi), %xmm2 3642; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3643; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 3644; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3645; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3646; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload 3647; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] 3648; SSE-NEXT: movdqa 464(%rdi), %xmm15 3649; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] 3650; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3651; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3652; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] 3653; SSE-NEXT: movdqa 416(%rdi), %xmm4 3654; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 3655; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3656; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3657; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3658; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm8[2,2,3,3] 3659; SSE-NEXT: movdqa 752(%rdi), %xmm0 3660; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3661; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 3662; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3663; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3664; SSE-NEXT: # xmm0 = mem[2,3,2,3] 3665; SSE-NEXT: movdqa 704(%rdi), %xmm12 3666; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] 3667; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3668; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3669; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3670; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] 3671; SSE-NEXT: movdqa 656(%rdi), %xmm0 3672; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3673; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 3674; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3675; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 3676; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] 3677; SSE-NEXT: movdqa 608(%rdi), %xmm5 3678; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 3679; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3680; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3681; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3682; SSE-NEXT: # xmm0 = mem[3,3,3,3] 3683; SSE-NEXT: movdqa %xmm13, %xmm9 3684; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3685; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] 3686; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3687; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3688; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3689; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3690; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] 3691; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3692; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3693; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] 3694; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 3695; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] 3696; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3697; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3698; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3699; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3700; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 3701; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3702; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3703; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] 3704; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] 3705; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3706; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3707; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3708; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3709; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] 3710; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3711; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3712; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3713; SSE-NEXT: # xmm0 = mem[3,3,3,3] 3714; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3715; SSE-NEXT: 
pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] 3716; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3717; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3718; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3719; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3720; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] 3721; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3722; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3723; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] 3724; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] 3725; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3726; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] 3727; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3728; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] 3729; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3730; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3731; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3732; SSE-NEXT: # xmm0 = mem[3,3,3,3] 3733; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3734; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] 3735; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3736; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3737; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3738; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3739; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 3740; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3741; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill 3742; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] 3743; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3744; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] 3745; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3746; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3747; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3748; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 3749; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 3750; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3751; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3752; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3753; SSE-NEXT: # xmm0 = mem[3,3,3,3] 3754; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] 3755; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3756; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3757; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3758; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 3759; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] 3760; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 3761; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3762; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] 3763; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 3764; SSE-NEXT: movdqa %xmm12, %xmm1 3765; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3766; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] 3767; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 3768; SSE-NEXT: # xmm9 = mem[0,0,1,1] 3769; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] 3770; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] 3771; 
SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3772; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] 3773; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3774; SSE-NEXT: movdqa %xmm13, %xmm1 3775; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3776; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] 3777; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3778; SSE-NEXT: # xmm3 = mem[0,0,1,1] 3779; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 3780; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] 3781; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3782; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3783; SSE-NEXT: # xmm0 = mem[2,3,2,3] 3784; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3785; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3786; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] 3787; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3788; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[0,0,1,1] 3789; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] 3790; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] 3791; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] 3792; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3793; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3794; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] 3795; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 3796; SSE-NEXT: # xmm8 = mem[0,0,1,1] 3797; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] 3798; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] 3799; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3800; SSE-NEXT: # xmm0 = mem[2,3,2,3] 3801; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3802; SSE-NEXT: movdqa %xmm10, %xmm1 3803; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3804; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] 3805; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 3806; SSE-NEXT: # xmm6 = mem[0,0,1,1] 3807; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] 3808; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] 3809; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] 3810; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3811; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3812; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] 3813; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 3814; SSE-NEXT: # xmm4 = mem[0,0,1,1] 3815; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] 3816; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] 3817; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] 3818; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3819; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3820; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] 3821; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3822; SSE-NEXT: # xmm3 = mem[0,0,1,1] 3823; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 3824; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] 3825; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 3826; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] 3827; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload 3828; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3829; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] 3830; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 3831; SSE-NEXT: # xmm2 = mem[0,0,1,1] 3832; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3833; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 3834; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] 3835; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3836; SSE-NEXT: # xmm1 = mem[3,3,3,3] 3837; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3838; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3839; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3840; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 3841; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] 3842; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] 3843; SSE-NEXT: movapd %xmm15, %xmm7 3844; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] 3845; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3846; SSE-NEXT: # xmm1 = mem[3,3,3,3] 3847; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3848; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3849; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3850; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 3851; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] 3852; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] 3853; SSE-NEXT: movapd %xmm15, %xmm11 3854; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3855; SSE-NEXT: # xmm0 = mem[1,1,1,1] 3856; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3857; SSE-NEXT: # xmm1 = mem[3,3,3,3] 3858; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3859; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3860; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3861; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] 3862; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] 3863; SSE-NEXT: movapd %xmm14, %xmm12 3864; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3865; SSE-NEXT: # xmm0 = mem[1,1,1,1] 3866; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3867; SSE-NEXT: # xmm1 = mem[3,3,3,3] 3868; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3869; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3870; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3871; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3872; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] 3873; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] 3874; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3875; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] 3876; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3877; SSE-NEXT: # xmm1 = mem[3,3,3,3] 3878; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3879; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3880; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3881; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 3882; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] 3883; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] 3884; 
SSE-NEXT: movapd %xmm15, %xmm10 3885; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3886; SSE-NEXT: # xmm0 = mem[1,1,1,1] 3887; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3888; SSE-NEXT: # xmm1 = mem[3,3,3,3] 3889; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3890; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3891; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3892; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 3893; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] 3894; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] 3895; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3896; SSE-NEXT: # xmm0 = mem[1,1,1,1] 3897; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3898; SSE-NEXT: # xmm1 = mem[3,3,3,3] 3899; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3900; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3901; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3902; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3903; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] 3904; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] 3905; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3906; SSE-NEXT: # xmm0 = mem[1,1,1,1] 3907; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] 3908; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3909; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3910; SSE-NEXT: # xmm1 = mem[2,3,2,3] 3911; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3912; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] 3913; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] 3914; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3915; SSE-NEXT: movaps %xmm0, 96(%rsi) 3916; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3917; SSE-NEXT: movaps %xmm0, 32(%rsi) 3918; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3919; SSE-NEXT: movaps %xmm0, 112(%rsi) 3920; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3921; SSE-NEXT: movaps %xmm0, 48(%rsi) 3922; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3923; SSE-NEXT: movaps %xmm0, 64(%rsi) 3924; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3925; SSE-NEXT: movaps %xmm0, (%rsi) 3926; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3927; SSE-NEXT: movaps %xmm0, 80(%rsi) 3928; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3929; SSE-NEXT: movaps %xmm0, 16(%rsi) 3930; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3931; SSE-NEXT: movaps %xmm0, 96(%rdx) 3932; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3933; SSE-NEXT: movaps %xmm0, 32(%rdx) 3934; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3935; SSE-NEXT: movaps %xmm0, 112(%rdx) 3936; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3937; SSE-NEXT: movaps %xmm0, 48(%rdx) 3938; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3939; SSE-NEXT: movaps %xmm0, 64(%rdx) 3940; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3941; SSE-NEXT: movaps %xmm0, (%rdx) 3942; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3943; SSE-NEXT: movaps 
%xmm0, 80(%rdx) 3944; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3945; SSE-NEXT: movaps %xmm0, 16(%rdx) 3946; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3947; SSE-NEXT: movaps %xmm0, 96(%rcx) 3948; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3949; SSE-NEXT: movaps %xmm0, 112(%rcx) 3950; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3951; SSE-NEXT: movaps %xmm0, 64(%rcx) 3952; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3953; SSE-NEXT: movaps %xmm0, 80(%rcx) 3954; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3955; SSE-NEXT: movaps %xmm0, 32(%rcx) 3956; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3957; SSE-NEXT: movaps %xmm0, 48(%rcx) 3958; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3959; SSE-NEXT: movaps %xmm0, (%rcx) 3960; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3961; SSE-NEXT: movaps %xmm0, 16(%rcx) 3962; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3963; SSE-NEXT: movaps %xmm0, 112(%r8) 3964; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3965; SSE-NEXT: movaps %xmm0, 96(%r8) 3966; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 3967; SSE-NEXT: movaps %xmm0, 80(%r8) 3968; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3969; SSE-NEXT: movaps %xmm0, 64(%r8) 3970; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3971; SSE-NEXT: movaps %xmm0, 48(%r8) 3972; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3973; SSE-NEXT: movaps %xmm0, 32(%r8) 3974; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3975; SSE-NEXT: movaps %xmm0, 16(%r8) 3976; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3977; SSE-NEXT: movaps %xmm0, (%r8) 3978; SSE-NEXT: movapd %xmm2, 112(%r9) 3979; SSE-NEXT: movapd %xmm3, 96(%r9) 3980; SSE-NEXT: movapd %xmm4, 80(%r9) 3981; SSE-NEXT: movapd %xmm6, 64(%r9) 3982; SSE-NEXT: movapd %xmm8, 48(%r9) 3983; SSE-NEXT: movapd %xmm9, 32(%r9) 3984; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3985; SSE-NEXT: movaps %xmm0, 16(%r9) 3986; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3987; SSE-NEXT: movaps %xmm0, (%r9) 3988; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 3989; SSE-NEXT: movapd %xmm14, 112(%rax) 3990; SSE-NEXT: movapd %xmm13, 96(%rax) 3991; SSE-NEXT: movapd %xmm15, 80(%rax) 3992; SSE-NEXT: movapd %xmm10, 64(%rax) 3993; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3994; SSE-NEXT: movaps %xmm0, 48(%rax) 3995; SSE-NEXT: movapd %xmm12, 32(%rax) 3996; SSE-NEXT: movapd %xmm11, 16(%rax) 3997; SSE-NEXT: movapd %xmm7, (%rax) 3998; SSE-NEXT: addq $1032, %rsp # imm = 0x408 3999; SSE-NEXT: retq 4000; 4001; AVX-LABEL: load_i32_stride6_vf32: 4002; AVX: # %bb.0: 4003; AVX-NEXT: subq $1032, %rsp # imm = 0x408 4004; AVX-NEXT: vmovaps 416(%rdi), %ymm9 4005; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4006; AVX-NEXT: vmovaps 480(%rdi), %ymm4 4007; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4008; AVX-NEXT: vmovaps 448(%rdi), %ymm5 4009; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4010; AVX-NEXT: vmovapd 160(%rdi), %ymm2 4011; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4012; AVX-NEXT: vmovapd 128(%rdi), %ymm3 4013; AVX-NEXT: vmovupd %ymm3, (%rsp) # 32-byte Spill 4014; AVX-NEXT: vmovaps 32(%rdi), %ymm6 4015; AVX-NEXT: vmovups %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4016; AVX-NEXT: vmovaps (%rdi), %ymm7 4017; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4018; AVX-NEXT: vmovaps 96(%rdi), %ymm1 4019; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4020; AVX-NEXT: vmovaps 64(%rdi), %ymm0 4021; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4022; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm8 4023; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm8[0,0],ymm1[6,4],ymm8[4,4] 4024; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,2],ymm0[6,4],ymm8[6,6] 4025; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] 4026; AVX-NEXT: vextractf128 $1, %ymm12, %xmm7 4027; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm7[2,3] 4028; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,3] 4029; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 4030; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm2[0,1] 4031; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4032; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[3],ymm3[2] 4033; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 4034; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 4035; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4036; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm5, %ymm6 4037; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm6[0,0],ymm4[6,4],ymm6[4,4] 4038; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,2],ymm0[6,4],ymm6[6,6] 4039; AVX-NEXT: vmovaps 384(%rdi), %ymm1 4040; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4041; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm9[4,5],ymm1[6,7] 4042; AVX-NEXT: vextractf128 $1, %ymm13, %xmm5 4043; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm5[2,3] 4044; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,3] 4045; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 4046; AVX-NEXT: vmovapd 544(%rdi), %ymm1 4047; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4048; AVX-NEXT: vmovapd 512(%rdi), %ymm2 4049; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4050; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] 4051; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4052; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2] 4053; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 4054; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 4055; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4056; AVX-NEXT: vmovaps 288(%rdi), %ymm1 4057; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4058; AVX-NEXT: vmovaps 256(%rdi), %ymm0 4059; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4060; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm3 4061; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm3[0,0],ymm1[6,4],ymm3[4,4] 4062; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,2],ymm0[6,4],ymm3[6,6] 4063; AVX-NEXT: vmovaps 224(%rdi), %ymm1 4064; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4065; AVX-NEXT: vmovaps 192(%rdi), %ymm2 4066; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4067; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] 4068; AVX-NEXT: vextractf128 $1, %ymm11, %xmm2 4069; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm2[2,3] 4070; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3] 4071; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovapd 352(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 320(%rdi), %ymm4
; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[0,1]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[3],ymm4[2]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 672(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 640(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm9
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm9[0,0],ymm1[6,4],ymm9[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6]
; AVX-NEXT: vmovaps 608(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 576(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vextractf128 $1, %ymm4, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm4[0,1],xmm1[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm1[0,3]
; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3,4,5,6,7]
; AVX-NEXT: vmovapd 736(%rdi), %ymm10
; AVX-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 704(%rdi), %ymm0
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm10[0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm10[0],ymm0[1],ymm10[3],ymm0[2]
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm15[3,0],ymm8[1,0],ymm15[7,4],ymm8[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm14[2,0],ymm8[2,3],ymm14[6,4],ymm8[6,7]
; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,0],xmm7[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm12[0,2],xmm7[1,3]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7]
; AVX-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm12[1,3],ymm0[7,5],ymm12[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm6[1,0],ymm8[7,4],ymm6[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7]
; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm13[1,0],xmm5[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm7[0,2],xmm5[1,3]
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,1],ymm14[1,3],ymm0[7,5],ymm14[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,0],ymm3[1,0],ymm0[7,4],ymm3[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm3[2,3],ymm5[6,4],ymm3[6,7]
; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm11[1,0],xmm2[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,2],xmm2[1,3]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX-NEXT: # ymm3 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,0],ymm9[1,0],ymm11[7,4],ymm9[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,0],ymm9[2,3],ymm2[6,4],ymm9[6,7]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,0],xmm1[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm1[1,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,1],ymm13[1,3],ymm10[7,5],ymm13[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX-NEXT: # ymm4 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm15[2,1],mem[2,0],ymm15[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm4, %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
; AVX-NEXT: # ymm3 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm3[2,0],ymm0[4,4],ymm3[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX-NEXT: # ymm6 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
; AVX-NEXT: # ymm5 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm6, %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,0],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm14[0,1,2,3],mem[4,5],ymm14[6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,0],ymm2[2,0],ymm0[4,4],ymm2[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm7[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
; AVX-NEXT: # ymm9 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload
; AVX-NEXT: # ymm7 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm7[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm9, %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm9[2,0],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2],ymm10[3,4,5,6,7]
; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload
; AVX-NEXT: # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX-NEXT: # ymm12 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm5[2,1],ymm8[2,0],ymm5[6,5],ymm8[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm12, %xmm14
; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,0],xmm14[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm3[3,1],ymm1[4,5],ymm3[7,5]
; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = xmm4[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,1],ymm13[2,1],ymm11[7,5],ymm13[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[3,1],ymm1[4,5],ymm2[7,5]
; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = xmm6[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm6[2,1],ymm4[7,5],ymm6[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,1],xmm14[3,3]
; AVX-NEXT: vmovaps %ymm5, %ymm3
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm8[2,1],ymm5[7,5],ymm8[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm10[3,1],ymm7[4,5],ymm10[7,5]
; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm9[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm12[2,1],ymm2[7,5],ymm12[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 416(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 400(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT: vmovapd 464(%rdi), %xmm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[3]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm1[2,0],ymm7[4,6],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 32(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 16(%rdi), %xmm15
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3]
; AVX-NEXT: vmovapd 80(%rdi), %xmm10
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[1],ymm13[0],ymm10[2],ymm13[3]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,0],ymm11[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm1[0,0],ymm6[6,4],ymm1[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm2[2,0],ymm1[4,6],ymm2[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX-NEXT: # ymm5 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 224(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 208(%rdi), %xmm13
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3]
; AVX-NEXT: vmovapd 272(%rdi), %xmm2
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm8[0],ymm2[2],ymm8[3]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm2[0,0],ymm5[6,4],ymm2[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,2],ymm3[2,0],ymm2[4,6],ymm3[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 608(%rdi), %xmm11
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm11[2,2,3,3]
; AVX-NEXT: vmovaps 592(%rdi), %xmm8
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3]
; AVX-NEXT: vmovapd 656(%rdi), %xmm9
; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm9[1],ymm12[0],ymm9[2],ymm12[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm4[0,1],ymm14[2,0],ymm4[4,5],ymm14[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm14[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm3[2,0],ymm0[0,0],ymm3[6,4],ymm0[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm12[5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm1[1,0],ymm6[7,4],ymm1[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm6[2,0],ymm1[4,7],ymm6[6,4]
; AVX-NEXT: vblendps $12, (%rsp), %xmm15, %xmm6 # 16-byte Folded Reload
; AVX-NEXT: # xmm6 = xmm15[0,1],mem[2,3]
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
; AVX-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm7[1,0],ymm1[7,4],ymm7[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,3],ymm1[2,0],ymm7[4,7],ymm1[6,4]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
; AVX-NEXT: # xmm7 = xmm7[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
; AVX-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,0],ymm2[1,0],ymm5[7,4],ymm2[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,3],ymm1[2,0],ymm2[4,7],ymm1[6,4]
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = xmm13[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX-NEXT: # ymm5 = ymm5[3,1],mem[1,3],ymm5[7,5],mem[5,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1],ymm5[2,0],ymm10[5,5],ymm5[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm0[1,0],ymm3[7,4],ymm0[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[2,0],ymm0[4,7],ymm2[6,4]
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm11[2,3]
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
; AVX-NEXT: # ymm3 = ymm9[3,1],mem[1,3],ymm9[7,5],mem[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1],ymm3[2,0],ymm4[5,5],ymm3[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 96(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 32(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 64(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, (%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 96(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 64(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, (%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 96(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 64(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, (%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 96(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 32(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 64(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, (%r8)
; AVX-NEXT: vmovaps %ymm14, 96(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 32(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, (%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 64(%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps %ymm0, 96(%rax)
; AVX-NEXT: vmovaps %ymm1, 32(%rax)
; AVX-NEXT: vmovaps %ymm7, 64(%rax)
; AVX-NEXT: vmovaps %ymm6, (%rax)
; AVX-NEXT: addq $1032, %rsp # imm = 0x408
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i32_stride6_vf32:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $1224, %rsp # imm = 0x4C8
; AVX2-NEXT: vmovaps 480(%rdi), %ymm9
; AVX2-NEXT: vmovaps 448(%rdi), %ymm11
; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 416(%rdi), %ymm8
; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 128(%rdi), %ymm2
; AVX2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
; AVX2-NEXT: vmovaps 160(%rdi), %ymm3
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 96(%rdi), %ymm4
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 64(%rdi), %ymm5
; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{.*#+}} xmm10 = [0,6,4,u]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm0, %ymm10, %ymm0
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2]
; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[0,1],ymm9[0,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm9[6,7]
; AVX2-NEXT: vmovaps 384(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
; AVX2-NEXT: vpermps %ymm3, %ymm10, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-NEXT: vmovaps 512(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 544(%rdi), %ymm2
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpermps %ymm2, %ymm6, %ymm1
; AVX2-NEXT: vmovaps %ymm6, %ymm9
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 288(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 256(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovaps 224(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 192(%rdi), %ymm6
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
; AVX2-NEXT: vpermps %ymm1, %ymm10, %ymm8
; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm13[0,2,2,2,4,6,6,6]
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1,2],ymm11[3,4,5,6,7]
; AVX2-NEXT: vmovaps 320(%rdi), %ymm6
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 352(%rdi), %ymm8
; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vpermps %ymm8, %ymm9, %ymm14
; AVX2-NEXT: vmovaps %ymm9, %ymm0
; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm14[6,7]
; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 608(%rdi), %ymm6
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 576(%rdi), %ymm9
; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7]
; AVX2-NEXT: vpermps %ymm14, %ymm10, %ymm10
; AVX2-NEXT: vmovaps 672(%rdi), %ymm6
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 640(%rdi), %ymm9
; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm6[0,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm6[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7]
; AVX2-NEXT: vmovaps 704(%rdi), %ymm6
; AVX2-NEXT: vmovaps 736(%rdi), %ymm11
; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm9
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{.*#+}} xmm9 = [1,7,5,u]
; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7]
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm10 = [5,3,5,3,5,3,5,3]
; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm7, %ymm10, %ymm7
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm3, %ymm9, %ymm3
; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3,2,3,5,7,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
; AVX2-NEXT: vpermps %ymm2, %ymm10, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm1, %ymm9, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,3,2,3,5,7,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-NEXT: vpermps %ymm8, %ymm10, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm14, %ymm9, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-NEXT: vpermps %ymm15, %ymm10, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
; AVX2-NEXT: # ymm3 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0,2,3,6,4,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
; AVX2-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0,2,0,4,4,6,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: # ymm15 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,0,2,3,6,4,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX2-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,2,0,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm1[3,4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,0,2,0,4,4,6,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm8[5,6,7]
; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm11[2,3],ymm6[4,5],ymm11[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm8[2,0,2,3,6,4,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5],mem[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,0,2,0,4,4,6,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm10[5,6,7]
; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,3,3,7,5,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,7,5]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,3,3,3,7,7,7,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,1,3,3,7,5,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,1,4,5,7,5]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7]
; AVX2-NEXT: vmovaps %ymm6, %ymm4
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,1,3,3,7,5,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,1,4,5,7,5]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,3,3,7,5,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,7,5]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX2-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 464(%rdi), %xmm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-NEXT: vpermps %ymm3, %ymm8, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
; AVX2-NEXT: # ymm7 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6]
; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm7, %ymm3, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-NEXT: # ymm14 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovaps 80(%rdi), %xmm5
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-NEXT: vpermps %ymm14, %ymm8, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX2-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm11[4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX2-NEXT: # ymm12 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovaps 272(%rdi), %xmm4
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-NEXT: vpermps %ymm12, %ymm8, %ymm11
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX2-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm15
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm15[5,6,7]
; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
; AVX2-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovaps 656(%rdi), %xmm0
; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6]
; AVX2-NEXT: vpermps %ymm15, %ymm8, %ymm8
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
; AVX2-NEXT: # ymm10 = ymm10[0,1],mem[2,3],ymm10[4,5,6,7]
; AVX2-NEXT: vpermps %ymm10, %ymm3, %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-NEXT: vpermps %ymm14, %ymm13, %ymm6
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7]
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7]
; AVX2-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm2, %ymm6, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7]
; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7]
; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm7
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7]
; AVX2-NEXT: vpermps %ymm12, %ymm13, %ymm7
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7]
; AVX2-NEXT: vpermps %ymm1, %ymm6, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
; AVX2-NEXT: vpermps %ymm15, %ymm13, %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT: vpermps %ymm10, %ymm6, %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 96(%rsi)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 64(%rsi)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, (%rsi)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 96(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 32(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 64(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, (%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 32(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 96(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 64(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, (%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 96(%r8)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 32(%r8)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 64(%r8)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, (%r8)
; AVX2-NEXT: vmovaps %ymm3, 96(%r9)
; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm3, 32(%r9)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm3, (%r9)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm3, 64(%r9)
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vmovaps %ymm0, 96(%rax)
; AVX2-NEXT: vmovaps %ymm1, 32(%rax)
; AVX2-NEXT: vmovaps %ymm5, 64(%rax)
; AVX2-NEXT: vmovaps %ymm2, (%rax)
; AVX2-NEXT: addq $1224, %rsp # imm = 0x4C8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i32_stride6_vf32:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: subq $1224, %rsp # imm = 0x4C8
; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm9
; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm11
; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm8
; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm2
; AVX2-FP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm3
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm4
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm5
; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm10 = [0,6,4,u]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm0, %ymm10, %ymm0
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2]
; AVX2-FP-NEXT: vpermps %ymm7, %ymm6, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[0,1],ymm9[0,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vpermps %ymm3, %ymm10, %ymm0
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm2
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm2, %ymm6, %ymm1
; AVX2-FP-NEXT: vmovaps %ymm6, %ymm9
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm6
; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vpermps %ymm1, %ymm10, %ymm8
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm13[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1,2],ymm11[3,4,5,6,7]
; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm6
; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm8
; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm8, %ymm9, %ymm14
; AVX2-FP-NEXT: vmovaps %ymm9, %ymm0
; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm6
; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm9
; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7]
; AVX2-FP-NEXT: vpermps %ymm14, %ymm10, %ymm10
; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm6
; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm9
; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm6[0,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm6
; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm11
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm9
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm9 = [1,7,5,u]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7]
; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm10 = [5,3,5,3,5,3,5,3]
; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm7, %ymm10, %ymm7
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm3, %ymm9, %ymm3
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm2, %ymm10, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm1, %ymm9, %ymm0
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm8, %ymm10, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm14, %ymm9, %ymm0
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm15, %ymm10, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm15 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm11[2,3],ymm6[4,5],ymm11[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm8[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5],mem[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7]
; AVX2-FP-NEXT: vmovaps %ymm6, %ymm4
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 464(%rdi), %xmm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FP-NEXT: vpermps %ymm3, %ymm8, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm7 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6]
; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FP-NEXT: vpermps %ymm7, %ymm3, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm14 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm5
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vpermps %ymm14, %ymm8, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm12 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm4
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vpermps %ymm12, %ymm8, %ymm11
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm1, %ymm3, %ymm15
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm15[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 656(%rdi), %xmm0
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vpermps %ymm15, %ymm8, %ymm8
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm10 = ymm10[0,1],mem[2,3],ymm10[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm10, %ymm3, %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FP-NEXT: vpermps %ymm14, %ymm13, %ymm6
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7]
; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7]
; AVX2-FP-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-FP-NEXT: vpermps %ymm2, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm7, %ymm6, %ymm7
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm12, %ymm13, %ymm7
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm1, %ymm6, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm15, %ymm13, %ymm4
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm10, %ymm6, %ymm4
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 96(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 64(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%r8)
; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r9)
; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r9)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm3, (%r9)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r9)
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax)
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax)
; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rax)
; AVX2-FP-NEXT: vmovaps %ymm2, (%rax)
; AVX2-FP-NEXT: addq $1224, %rsp # imm = 0x4C8
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i32_stride6_vf32:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $1192, %rsp # imm = 0x4A8
; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm11
; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm10
; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm4
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm5
; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm9 = [0,6,4,u]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm9, %ymm0
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7
= ymm3[0,1,2,3],ymm2[4,5,6,7] 5217; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] 5218; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm1 5219; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 5220; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5221; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5222; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[0,1],ymm6[0,1] 5223; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] 5224; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm0 5225; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5226; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm10[4,5],ymm0[6,7] 5227; AVX2-FCP-NEXT: vpermps %ymm4, %ymm9, %ymm0 5228; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] 5229; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 5230; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm1 5231; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5232; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm3 5233; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5234; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] 5235; AVX2-FCP-NEXT: vpermps %ymm3, %ymm5, %ymm1 5236; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 5237; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5238; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm1 5239; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5240; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm0 5241; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5242; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] 5243; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] 5244; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm0 5245; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5246; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1 5247; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5248; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] 5249; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm0 5250; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,2,2,4,6,6,6] 5251; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] 5252; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm10 5253; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5254; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm0 5255; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5256; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm10[4,5,6,7] 5257; AVX2-FCP-NEXT: vpermps %ymm13, %ymm5, %ymm15 5258; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm0 5259; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5260; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm15[6,7] 5261; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5262; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm10 5263; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5264; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm11 5265; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5266; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3],ymm10[4,5],ymm11[6,7] 5267; AVX2-FCP-NEXT: vpermps %ymm14, %ymm9, %ymm15 5268; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm5 5269; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5270; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm9 5271; 
AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5272; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm5[0,1] 5273; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm5[6,7] 5274; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2,2,2,4,6,6,6] 5275; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2],ymm11[3,4,5,6,7] 5276; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm9 5277; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5278; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm10 5279; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5280; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm9[4,5,6,7] 5281; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm10 5282; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm10[6,7] 5283; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5284; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm10 = [1,7,5,u] 5285; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload 5286; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] 5287; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7] 5288; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3] 5289; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm7 5290; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] 5291; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5292; AVX2-FCP-NEXT: vpermps %ymm4, %ymm10, %ymm4 5293; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] 5294; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] 5295; AVX2-FCP-NEXT: vpermps %ymm3, %ymm5, %ymm3 5296; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] 5297; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5298; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 5299; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] 5300; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 5301; AVX2-FCP-NEXT: vpermps %ymm13, %ymm5, %ymm0 5302; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 5303; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5304; AVX2-FCP-NEXT: vpermps %ymm14, %ymm10, %ymm0 5305; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,3,2,3,5,7,6,7] 5306; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 5307; AVX2-FCP-NEXT: vpermps %ymm15, %ymm5, %ymm1 5308; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm13 5309; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 5310; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5311; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 5312; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload 5313; AVX2-FCP-NEXT: # ymm0 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] 5314; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm7 = [2,0,6,4,2,0,6,7] 5315; AVX2-FCP-NEXT: vpermps %ymm0, %ymm7, %ymm0 5316; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm8 = [2,0,6,7] 5317; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5318; AVX2-FCP-NEXT: vblendps $12, (%rsp), %ymm1, %ymm3 # 32-byte Folded Reload 5319; AVX2-FCP-NEXT: # ymm3 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] 5320; AVX2-FCP-NEXT: vpermps %ymm3, %ymm8, %ymm1 5321; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 5322; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5323; AVX2-FCP-NEXT: vblendps $48, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload 5324; AVX2-FCP-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] 5325; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,0,6,4,0,0,6,4] 5326; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] 5327; AVX2-FCP-NEXT: vpermps %ymm2, %ymm10, %ymm1 5328; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 5329; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5330; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 5331; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload 5332; AVX2-FCP-NEXT: # ymm0 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] 5333; AVX2-FCP-NEXT: vpermps %ymm0, %ymm7, %ymm0 5334; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5335; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload 5336; AVX2-FCP-NEXT: # ymm14 = mem[0,1],ymm1[2,3],mem[4,5,6,7] 5337; AVX2-FCP-NEXT: vpermps %ymm14, %ymm8, %ymm1 5338; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 5339; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5340; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 5341; AVX2-FCP-NEXT: # ymm6 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] 5342; AVX2-FCP-NEXT: vpermps %ymm6, %ymm10, %ymm1 5343; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 5344; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5345; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5346; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload 5347; AVX2-FCP-NEXT: # ymm0 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7] 5348; AVX2-FCP-NEXT: vpermps %ymm0, %ymm7, %ymm1 5349; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5350; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 5351; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] 5352; AVX2-FCP-NEXT: vpermps %ymm0, %ymm8, %ymm11 5353; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm1[3,4,5,6,7] 5354; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5355; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 5356; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] 5357; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm12 5358; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm12[5,6,7] 5359; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5360; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 5361; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload 5362; AVX2-FCP-NEXT: # ymm11 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] 5363; AVX2-FCP-NEXT: vpermps %ymm11, %ymm7, %ymm7 5364; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 5365; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 5366; AVX2-FCP-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] 5367; AVX2-FCP-NEXT: vpermps %ymm11, %ymm8, %ymm8 5368; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] 5369; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 5370; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload 5371; AVX2-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] 5372; 
AVX2-FCP-NEXT: vpermps %ymm8, %ymm10, %ymm10 5373; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7] 5374; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5375; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm7 = mem[3,3,3,3] 5376; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3,4],ymm15[5],ymm7[6,7] 5377; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4],ymm7[5],ymm3[6,7] 5378; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [0,1,7,5,0,1,7,5] 5379; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] 5380; AVX2-FCP-NEXT: vpermps %ymm2, %ymm7, %ymm2 5381; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm10 = [3,1,7,5,0,u,u,u] 5382; AVX2-FCP-NEXT: vpermps %ymm3, %ymm10, %ymm3 5383; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] 5384; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5385; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm2 = mem[3,3,3,3] 5386; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] 5387; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1,2,3,4],ymm2[5],ymm14[6,7] 5388; AVX2-FCP-NEXT: vpermps %ymm2, %ymm10, %ymm2 5389; AVX2-FCP-NEXT: vpermps %ymm6, %ymm7, %ymm3 5390; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] 5391; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5392; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm2 = mem[3,3,3,3] 5393; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] 5394; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1,2,3,4],ymm2[5],ymm11[6,7] 5395; AVX2-FCP-NEXT: vpermps %ymm2, %ymm10, %ymm2 5396; AVX2-FCP-NEXT: vpermps %ymm8, %ymm7, %ymm3 5397; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] 5398; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5399; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm2 = mem[3,3,3,3] 5400; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] 5401; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] 5402; AVX2-FCP-NEXT: vpermps %ymm0, %ymm10, %ymm0 5403; AVX2-FCP-NEXT: vpermps %ymm1, %ymm7, %ymm1 5404; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 5405; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5406; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload 5407; AVX2-FCP-NEXT: # ymm1 = ymm4[0,1,2,3],mem[4,5,6,7] 5408; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5409; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5410; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 5411; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3],ymm0[4,5,6,7] 5412; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5413; AVX2-FCP-NEXT: vmovaps 464(%rdi), %xmm0 5414; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5415; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 5416; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 5417; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5418; AVX2-FCP-NEXT: vpermps %ymm3, %ymm5, %ymm1 5419; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 5420; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5421; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 5422; AVX2-FCP-NEXT: # ymm6 = 
mem[0,1],ymm1[2,3],mem[4,5,6,7] 5423; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6] 5424; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] 5425; AVX2-FCP-NEXT: vpermps %ymm6, %ymm4, %ymm1 5426; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 5427; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5428; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload 5429; AVX2-FCP-NEXT: # ymm3 = ymm15[0,1,2,3],mem[4,5,6,7] 5430; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5431; AVX2-FCP-NEXT: vblendps $240, (%rsp), %ymm0, %ymm12 # 32-byte Folded Reload 5432; AVX2-FCP-NEXT: # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7] 5433; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm14 5434; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] 5435; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 5436; AVX2-FCP-NEXT: vpermps %ymm12, %ymm5, %ymm1 5437; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] 5438; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5439; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 5440; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] 5441; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm8 5442; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7] 5443; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 5444; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload 5445; AVX2-FCP-NEXT: # ymm10 = ymm9[0,1,2,3],mem[4,5,6,7] 5446; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5447; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload 5448; AVX2-FCP-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] 5449; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm1 5450; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm1[2,3],ymm10[4,5,6,7] 5451; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] 5452; AVX2-FCP-NEXT: vpermps %ymm11, %ymm5, %ymm9 5453; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm8[2,3,4,5,6,7] 5454; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5455; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload 5456; AVX2-FCP-NEXT: # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7] 5457; AVX2-FCP-NEXT: vpermps %ymm8, %ymm4, %ymm15 5458; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm15[5,6,7] 5459; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5460; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5461; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload 5462; AVX2-FCP-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] 5463; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5464; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload 5465; AVX2-FCP-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] 5466; AVX2-FCP-NEXT: vmovaps 656(%rdi), %xmm0 5467; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm0[2,3],ymm9[4,5,6,7] 5468; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] 5469; AVX2-FCP-NEXT: vpermps %ymm15, %ymm5, %ymm5 5470; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3,4,5,6,7] 5471; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 5472; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 
# 32-byte Folded Reload 5473; AVX2-FCP-NEXT: # ymm7 = mem[0,1],ymm7[2,3],mem[4,5,6,7] 5474; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm4 5475; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] 5476; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] 5477; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] 5478; AVX2-FCP-NEXT: vpermps %ymm12, %ymm13, %ymm5 5479; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] 5480; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,3,1,7,0,3,1,7] 5481; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] 5482; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 5483; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] 5484; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 5485; AVX2-FCP-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] 5486; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 5487; AVX2-FCP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] 5488; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm12 # 32-byte Folded Reload 5489; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm3[2,3,4,5,6,7] 5490; AVX2-FCP-NEXT: vpermps %ymm6, %ymm5, %ymm6 5491; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] 5492; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm10[1,1,1,1,5,5,5,5] 5493; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7] 5494; AVX2-FCP-NEXT: vpermps %ymm11, %ymm13, %ymm6 5495; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7] 5496; AVX2-FCP-NEXT: vpermps %ymm8, %ymm5, %ymm6 5497; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] 5498; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] 5499; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6,7] 5500; AVX2-FCP-NEXT: vpermps %ymm15, %ymm13, %ymm6 5501; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] 5502; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm5 5503; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] 5504; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5505; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rsi) 5506; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5507; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rsi) 5508; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5509; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rsi) 5510; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5511; AVX2-FCP-NEXT: vmovaps %ymm5, (%rsi) 5512; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5513; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rdx) 5514; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5515; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rdx) 5516; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5517; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rdx) 5518; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5519; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx) 5520; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5521; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rcx) 5522; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5523; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rcx) 5524; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5525; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rcx) 5526; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5527; AVX2-FCP-NEXT: 
vmovaps %ymm5, (%rcx) 5528; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5529; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%r8) 5530; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5531; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%r8) 5532; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5533; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%r8) 5534; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5535; AVX2-FCP-NEXT: vmovaps %ymm5, (%r8) 5536; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%r9) 5537; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 5538; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r9) 5539; AVX2-FCP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload 5540; AVX2-FCP-NEXT: vmovaps %ymm4, (%r9) 5541; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 5542; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%r9) 5543; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 5544; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax) 5545; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax) 5546; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rax) 5547; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax) 5548; AVX2-FCP-NEXT: addq $1192, %rsp # imm = 0x4A8 5549; AVX2-FCP-NEXT: vzeroupper 5550; AVX2-FCP-NEXT: retq 5551; 5552; AVX512-LABEL: load_i32_stride6_vf32: 5553; AVX512: # %bb.0: 5554; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 5555; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm0 5556; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm3 5557; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm2 5558; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm5 5559; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm6 5560; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4 5561; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm10 5562; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm9 5563; AVX512-NEXT: vmovdqa64 (%rdi), %zmm11 5564; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 5565; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm12 5566; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm13 5567; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] 5568; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] 5569; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 5570; AVX512-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 5571; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] 5572; AVX512-NEXT: vmovdqa64 %zmm11, %zmm8 5573; AVX512-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 5574; AVX512-NEXT: movb $56, %dil 5575; AVX512-NEXT: kmovw %edi, %k2 5576; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} 5577; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] 5578; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 5579; AVX512-NEXT: vmovdqa64 %zmm9, %zmm16 5580; AVX512-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 5581; AVX512-NEXT: movw $-2048, %di # imm = 0xF800 5582; AVX512-NEXT: kmovw %edi, %k1 5583; AVX512-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} 5584; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 5585; AVX512-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 5586; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 5587; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} 5588; AVX512-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} 5589; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] 5590; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 5591; AVX512-NEXT: vmovdqa64 %zmm13, %zmm17 5592; AVX512-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 5593; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] 5594; AVX512-NEXT: vmovdqa64 %zmm11, %zmm15 5595; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 5596; AVX512-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} 5597; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = 
[0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] 5598; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] 5599; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 5600; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 5601; AVX512-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} 5602; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 5603; AVX512-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 5604; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 5605; AVX512-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} 5606; AVX512-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} 5607; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] 5608; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 5609; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 5610; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] 5611; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] 5612; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 5613; AVX512-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 5614; AVX512-NEXT: movw $31, %di 5615; AVX512-NEXT: kmovw %edi, %k2 5616; AVX512-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} 5617; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] 5618; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 5619; AVX512-NEXT: vmovdqa64 %zmm10, %zmm20 5620; AVX512-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 5621; AVX512-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} 5622; AVX512-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 5623; AVX512-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 5624; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 5625; AVX512-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} 5626; AVX512-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} 5627; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] 5628; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 5629; AVX512-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 5630; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] 5631; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 5632; AVX512-NEXT: vmovdqa64 %zmm12, %zmm18 5633; AVX512-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 5634; AVX512-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} 5635; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] 5636; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 5637; AVX512-NEXT: vmovdqa64 %zmm10, %zmm22 5638; AVX512-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 5639; AVX512-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1} 5640; AVX512-NEXT: vpermi2d %zmm4, %zmm6, %zmm21 5641; AVX512-NEXT: vpermi2d %zmm5, %zmm2, %zmm19 5642; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 5643; AVX512-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} 5644; AVX512-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} 5645; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] 5646; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] 5647; AVX512-NEXT: vmovdqa64 %zmm12, %zmm21 5648; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 5649; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] 5650; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 5651; AVX512-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 5652; AVX512-NEXT: movw $992, %di # imm = 0x3E0 5653; AVX512-NEXT: kmovw %edi, %k1 5654; AVX512-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} 5655; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] 5656; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 5657; AVX512-NEXT: vmovdqa64 %zmm9, %zmm24 5658; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm24 5659; AVX512-NEXT: movb $-32, %dil 5660; AVX512-NEXT: kmovw %edi, %k2 5661; AVX512-NEXT: vmovdqa64 %zmm24, %zmm23 {%k2} 5662; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm21 5663; AVX512-NEXT: vpermi2d %zmm5, %zmm2, %zmm20 5664; AVX512-NEXT: vpermi2d %zmm3, %zmm0, %zmm22 
5665; AVX512-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1} 5666; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} 5667; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] 5668; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] 5669; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 5670; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] 5671; AVX512-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 5672; AVX512-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} 5673; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] 5674; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 5675; AVX512-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 5676; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} 5677; AVX512-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 5678; AVX512-NEXT: vpermt2d %zmm5, %zmm20, %zmm2 5679; AVX512-NEXT: vpermt2d %zmm3, %zmm13, %zmm0 5680; AVX512-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} 5681; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} 5682; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rsi) 5683; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) 5684; AVX512-NEXT: vmovdqa64 %zmm14, 64(%rdx) 5685; AVX512-NEXT: vmovdqa64 %zmm15, (%rdx) 5686; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rcx) 5687; AVX512-NEXT: vmovdqa64 %zmm16, (%rcx) 5688; AVX512-NEXT: vmovdqa64 %zmm19, 64(%r8) 5689; AVX512-NEXT: vmovdqa64 %zmm18, (%r8) 5690; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r9) 5691; AVX512-NEXT: vmovdqa64 %zmm23, (%r9) 5692; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) 5693; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) 5694; AVX512-NEXT: vzeroupper 5695; AVX512-NEXT: retq 5696; 5697; AVX512-FCP-LABEL: load_i32_stride6_vf32: 5698; AVX512-FCP: # %bb.0: 5699; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 5700; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0 5701; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 5702; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 5703; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5 5704; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 5705; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 5706; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10 5707; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 5708; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 5709; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 5710; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 5711; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 5712; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] 5713; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] 5714; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 5715; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 5716; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] 5717; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 5718; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 5719; AVX512-FCP-NEXT: movb $56, %dil 5720; AVX512-FCP-NEXT: kmovw %edi, %k2 5721; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} 5722; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] 5723; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 5724; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 5725; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 5726; AVX512-FCP-NEXT: movw $-2048, %di # imm = 0xF800 5727; AVX512-FCP-NEXT: kmovw %edi, %k1 5728; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} 5729; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 5730; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 5731; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 5732; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} 5733; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} 5734; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = 
[1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] 5735; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 5736; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 5737; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 5738; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] 5739; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 5740; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 5741; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} 5742; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] 5743; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] 5744; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 5745; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 5746; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} 5747; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 5748; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 5749; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 5750; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} 5751; AVX512-FCP-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} 5752; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] 5753; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 5754; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 5755; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] 5756; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] 5757; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 5758; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 5759; AVX512-FCP-NEXT: movw $31, %di 5760; AVX512-FCP-NEXT: kmovw %edi, %k2 5761; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} 5762; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] 5763; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 5764; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm20 5765; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 5766; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} 5767; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 5768; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 5769; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 5770; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} 5771; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} 5772; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] 5773; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 5774; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 5775; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] 5776; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 5777; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 5778; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 5779; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} 5780; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] 5781; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 5782; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 5783; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 5784; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1} 5785; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm21 5786; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm19 5787; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 5788; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} 5789; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} 5790; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] 5791; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] 5792; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 5793; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 5794; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] 5795; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 
5796; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 5797; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0 5798; AVX512-FCP-NEXT: kmovw %edi, %k1 5799; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} 5800; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] 5801; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 5802; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 5803; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm24 5804; AVX512-FCP-NEXT: movb $-32, %dil 5805; AVX512-FCP-NEXT: kmovw %edi, %k2 5806; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k2} 5807; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm21 5808; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm20 5809; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm22 5810; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1} 5811; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} 5812; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] 5813; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] 5814; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 5815; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] 5816; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 5817; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} 5818; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] 5819; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 5820; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 5821; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} 5822; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 5823; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm20, %zmm2 5824; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm13, %zmm0 5825; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} 5826; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} 5827; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) 5828; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) 5829; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx) 5830; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rdx) 5831; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rcx) 5832; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) 5833; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%r8) 5834; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%r8) 5835; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9) 5836; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r9) 5837; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) 5838; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) 5839; AVX512-FCP-NEXT: vzeroupper 5840; AVX512-FCP-NEXT: retq 5841; 5842; AVX512DQ-LABEL: load_i32_stride6_vf32: 5843; AVX512DQ: # %bb.0: 5844; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 5845; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm0 5846; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm3 5847; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm2 5848; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm5 5849; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm6 5850; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm4 5851; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm10 5852; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm9 5853; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm11 5854; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 5855; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm12 5856; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm13 5857; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] 5858; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] 5859; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 5860; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 5861; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] 5862; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm8 5863; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 5864; AVX512DQ-NEXT: movb $56, %dil 5865; 
AVX512DQ-NEXT: kmovw %edi, %k2 5866; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} 5867; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] 5868; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 5869; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm16 5870; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 5871; AVX512DQ-NEXT: movw $-2048, %di # imm = 0xF800 5872; AVX512DQ-NEXT: kmovw %edi, %k1 5873; AVX512DQ-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} 5874; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 5875; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 5876; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 5877; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} 5878; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} 5879; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] 5880; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 5881; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm17 5882; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 5883; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] 5884; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm15 5885; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 5886; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} 5887; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] 5888; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] 5889; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 5890; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 5891; AVX512DQ-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} 5892; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 5893; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 5894; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 5895; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} 5896; AVX512DQ-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} 5897; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] 5898; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 5899; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 5900; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] 5901; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] 5902; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 5903; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 5904; AVX512DQ-NEXT: movw $31, %di 5905; AVX512DQ-NEXT: kmovw %edi, %k2 5906; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} 5907; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] 5908; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 5909; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm20 5910; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 5911; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} 5912; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 5913; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 5914; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 5915; AVX512DQ-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} 5916; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} 5917; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] 5918; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm21 5919; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 5920; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] 5921; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 5922; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm18 5923; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 5924; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} 5925; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] 5926; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 5927; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm22 5928; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 5929; 
AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1} 5930; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm6, %zmm21 5931; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm2, %zmm19 5932; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 5933; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} 5934; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} 5935; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] 5936; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] 5937; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm21 5938; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 5939; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] 5940; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 5941; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 5942; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 5943; AVX512DQ-NEXT: kmovw %edi, %k1 5944; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} 5945; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] 5946; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 5947; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm24 5948; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm21, %zmm24 5949; AVX512DQ-NEXT: movb $-32, %dil 5950; AVX512DQ-NEXT: kmovw %edi, %k2 5951; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm23 {%k2} 5952; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm21 5953; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm2, %zmm20 5954; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm0, %zmm22 5955; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1} 5956; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} 5957; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] 5958; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] 5959; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 5960; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] 5961; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 5962; AVX512DQ-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} 5963; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] 5964; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 5965; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 5966; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} 5967; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 5968; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm20, %zmm2 5969; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm13, %zmm0 5970; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} 5971; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} 5972; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rsi) 5973; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi) 5974; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%rdx) 5975; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rdx) 5976; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%rcx) 5977; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rcx) 5978; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%r8) 5979; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%r8) 5980; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%r9) 5981; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%r9) 5982; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax) 5983; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) 5984; AVX512DQ-NEXT: vzeroupper 5985; AVX512DQ-NEXT: retq 5986; 5987; AVX512DQ-FCP-LABEL: load_i32_stride6_vf32: 5988; AVX512DQ-FCP: # %bb.0: 5989; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 5990; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0 5991; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 5992; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 5993; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5 5994; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 5995; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 5996; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10 5997; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 5998; AVX512DQ-FCP-NEXT: 
vmovdqa64 (%rdi), %zmm11 5999; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 6000; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 6001; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 6002; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] 6003; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] 6004; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 6005; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 6006; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] 6007; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 6008; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 6009; AVX512DQ-FCP-NEXT: movb $56, %dil 6010; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 6011; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} 6012; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] 6013; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] 6014; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 6015; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 6016; AVX512DQ-FCP-NEXT: movw $-2048, %di # imm = 0xF800 6017; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 6018; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} 6019; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 6020; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 6021; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 6022; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} 6023; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} 6024; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] 6025; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 6026; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 6027; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 6028; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] 6029; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 6030; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 6031; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} 6032; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] 6033; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] 6034; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 6035; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 6036; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} 6037; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 6038; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 6039; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 6040; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} 6041; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} 6042; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] 6043; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 6044; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 6045; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] 6046; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] 6047; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 6048; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 6049; AVX512DQ-FCP-NEXT: movw $31, %di 6050; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 6051; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} 6052; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] 6053; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 6054; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm20 6055; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 6056; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} 6057; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 6058; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 6059; 
; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm18
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm21
; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm18
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm22
; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm21, %zmm22
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1}
; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm21
; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm19
; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm20
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm21
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm23
; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0
; AVX512DQ-FCP-NEXT: kmovw %edi, %k1
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm24
; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm24
; AVX512DQ-FCP-NEXT: movb $-32, %dil
; AVX512DQ-FCP-NEXT: kmovw %edi, %k2
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k2}
; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm21
; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm20
; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm22
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm9
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2}
; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm20, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm13, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r9)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i32_stride6_vf32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3
; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5
; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6
; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4
; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm10
; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm12
; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm13
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15
; AVX512BW-NEXT: vpermt2d %zmm12, %zmm14, %zmm15
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8
; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm8
; AVX512BW-NEXT: movb $56, %dil
; AVX512BW-NEXT: kmovd %edi, %k2
; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16
; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm16
; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1}
; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm15
; AVX512BW-NEXT: vpermi2d %zmm2, %zmm5, %zmm14
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm7
; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2}
; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17
; AVX512BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm17
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15
; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm15
; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18
; AVX512BW-NEXT: vpermt2d %zmm10, %zmm17, %zmm18
; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1}
; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm17
; AVX512BW-NEXT: vpermi2d %zmm2, %zmm5, %zmm16
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm14
; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2}
; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm19
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16
; AVX512BW-NEXT: vpermt2d %zmm13, %zmm17, %zmm16
; AVX512BW-NEXT: movw $31, %di
; AVX512BW-NEXT: kmovd %edi, %k2
; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm20
; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm20
; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1}
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm19
; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm17
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm18
; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2}
; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21
; AVX512BW-NEXT: vpermt2d %zmm1, %zmm20, %zmm21
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm18
; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18
; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm22
; AVX512BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm22
; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1}
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm21
; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm19
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm20
; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2}
; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21
; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm21
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512BW-NEXT: vpermt2d %zmm11, %zmm22, %zmm23
; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24
; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm24
; AVX512BW-NEXT: movb $-32, %dil
; AVX512BW-NEXT: kmovd %edi, %k2
; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 {%k2}
; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm21
; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm20
; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm22
; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm12
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0]
; AVX512BW-NEXT: vpermt2d %zmm11, %zmm13, %zmm1
; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm9
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2}
; AVX512BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm4
; AVX512BW-NEXT: vpermt2d %zmm5, %zmm20, %zmm2
; AVX512BW-NEXT: vpermt2d %zmm3, %zmm13, %zmm0
; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2}
; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%r8)
; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r8)
; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9)
; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r9)
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i32_stride6_vf32:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5
; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6
; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10
; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15
; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8
; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8
; AVX512BW-FCP-NEXT: movb $56, %dil
; AVX512BW-FCP-NEXT: kmovd %edi, %k2
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16
; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm16
; AVX512BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512BW-FCP-NEXT: kmovd %edi, %k1
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1}
; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm15
; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm14
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm7
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17
; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15
; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18
; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm18
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1}
; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm17
; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm16
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm14
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm16
; AVX512BW-FCP-NEXT: movw $31, %di
; AVX512BW-FCP-NEXT: kmovd %edi, %k2
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm20
; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm20
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1}
; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm19
; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm17
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm18
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21
; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm18
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22
; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm21, %zmm22
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1}
; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm21
; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm19
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm20
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm23
; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0
; AVX512BW-FCP-NEXT: kmovd %edi, %k1
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24
; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm24
; AVX512BW-FCP-NEXT: movb $-32, %dil
; AVX512BW-FCP-NEXT: kmovd %edi, %k2
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k2}
; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm21
; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm20
; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm22
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0]
; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm9
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2}
; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm4
; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm20, %zmm2
; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm13, %zmm0
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2}
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r9)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i32_stride6_vf32:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm9
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm11
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm12
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm13
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15
; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm14, %zmm15
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm8
; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm8
; AVX512DQ-BW-NEXT: movb $56, %dil
; AVX512DQ-BW-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm16
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm16
; AVX512DQ-BW-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512DQ-BW-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1}
; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm15
; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm5, %zmm14
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm17
; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm17
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15
; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm15
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm17, %zmm18
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1}
; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm17
; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm5, %zmm16
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm14
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1}
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm19
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16
; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm17, %zmm16
; AVX512DQ-BW-NEXT: movw $31, %di
; AVX512DQ-BW-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm20
; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm20
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1}
; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm19
; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm17
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm18
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1}
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21
; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm20, %zmm21
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm18
; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm22
; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm22
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1}
; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm21
; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm19
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm20
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm21
; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm21
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm22, %zmm23
; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0
; AVX512DQ-BW-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm24
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm24
; AVX512DQ-BW-NEXT: movb $-32, %dil
; AVX512DQ-BW-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm23 {%k2}
; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm21
; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm20
; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm22
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm12
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0]
; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm13, %zmm1
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm9
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2}
; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm4
; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm20, %zmm2
; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm13, %zmm0
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 64(%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r9)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r9)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf32:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8
; AVX512DQ-BW-FCP-NEXT: movb $56, %dil
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm16
; AVX512DQ-BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm16
; AVX512DQ-BW-FCP-NEXT: movw $31, %di
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm20
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm21, %zmm22
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm20
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm23
; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm24
; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm22
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm20, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm13, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <192 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <32 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186>
  %strided.vec1 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <32 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187>
  %strided.vec2 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <32 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188>
  %strided.vec3 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <32 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189>
  %strided.vec4 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <32 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190>
  %strided.vec5 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <32 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191>
  store <32 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <32 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <32 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <32 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <32 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <32 x i32> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i32_stride6_vf64:
; SSE: # %bb.0:
; SSE-NEXT: subq $2184, %rsp # imm = 0x888
; SSE-NEXT: movdqa 912(%rdi), %xmm7
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 928(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 864(%rdi), %xmm8
; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 880(%rdi), %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 528(%rdi), %xmm9
; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 544(%rdi), %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 480(%rdi), %xmm10
; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 496(%rdi), %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 144(%rdi), %xmm11
; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 160(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 96(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; SSE-NEXT: movdqa 112(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE-NEXT: movdqa %xmm10, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE-NEXT: movdqa %xmm8, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1248(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1264(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa 1296(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1312(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa 64(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 384(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 400(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa 432(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 448(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 768(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 784(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa 816(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 832(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1152(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1168(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa 1200(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1216(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 288(%rdi), %xmm2
; SSE-NEXT: movdqa 304(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm15
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa 336(%rdi), %xmm7
; SSE-NEXT: movdqa 352(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,1,1]
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 672(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 688(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa 720(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 736(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1056(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1072(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa 1104(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1120(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1440(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1456(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa 1488(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1504(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 192(%rdi), %xmm5
; SSE-NEXT: movdqa 208(%rdi), %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm5, %xmm3
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: movdqa 240(%rdi), %xmm2
; SSE-NEXT: movdqa 256(%rdi), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,1,1]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1]
; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 576(%rdi), %xmm10
; SSE-NEXT: movdqa 592(%rdi), %xmm14
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3]
; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm10, %xmm4
; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT: movdqa 624(%rdi), %xmm11
; SSE-NEXT: movdqa 640(%rdi), %xmm13
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,3,3]
; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,1,1]
; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1]
; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 960(%rdi), %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 976(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT: movdqa 1008(%rdi), %xmm8
; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1024(%rdi), %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1]
; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1344(%rdi), %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1360(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT: movdqa 1392(%rdi), %xmm9
; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1408(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1]
; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $85, (%rsp), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; SSE-NEXT: movdqa %xmm9, %xmm12
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1]
; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; SSE-NEXT: movdqa %xmm8, %xmm12
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1]
; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1]
; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT: movdqa %xmm12, %xmm7
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1]
; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1]
; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; SSE-NEXT: movdqa %xmm11, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT: movdqa %xmm13, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT: movdqa %xmm14, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT: movdqa %xmm5, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3]
; SSE-NEXT: movdqa 80(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[2,3,2,3]
; SSE-NEXT: movdqa 32(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
; SSE-NEXT: movdqa 176(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: pshufd $238, (%rsp), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[2,3,2,3]
; SSE-NEXT: movdqa 128(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[2,2,3,3]
; SSE-NEXT: movdqa 272(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[2,3,2,3]
; SSE-NEXT: movdqa 224(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[2,2,3,3]
; SSE-NEXT: movdqa 368(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[2,3,2,3]
; SSE-NEXT: movdqa 320(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
; SSE-NEXT: movdqa 464(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3]
; SSE-NEXT: movdqa 416(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
; SSE-NEXT: movdqa 560(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3]
; SSE-NEXT: movdqa 512(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[2,2,3,3]
; SSE-NEXT: movdqa 656(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[2,3,2,3]
; SSE-NEXT: movdqa 608(%rdi), %xmm9
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[2,2,3,3]
; SSE-NEXT: movdqa 752(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3]
; SSE-NEXT: movdqa 704(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[2,2,3,3]
; SSE-NEXT: movdqa 848(%rdi),
%xmm1 7251; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7252; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] 7253; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 7254; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] 7255; SSE-NEXT: movdqa 800(%rdi), %xmm2 7256; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7257; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 7258; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 7259; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7260; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] 7261; SSE-NEXT: movdqa 944(%rdi), %xmm1 7262; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7263; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] 7264; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 7265; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7266; SSE-NEXT: # xmm3 = mem[2,3,2,3] 7267; SSE-NEXT: movdqa 896(%rdi), %xmm8 7268; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] 7269; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7270; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 7271; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7272; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] 7273; SSE-NEXT: movdqa 1040(%rdi), %xmm1 7274; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7275; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] 7276; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 7277; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 7278; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] 7279; SSE-NEXT: movdqa 992(%rdi), %xmm1 7280; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7281; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 7282; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 7283; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7284; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] 7285; SSE-NEXT: movdqa 1136(%rdi), %xmm1 7286; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7287; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] 7288; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 7289; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7290; SSE-NEXT: # xmm3 = mem[2,3,2,3] 7291; SSE-NEXT: movdqa 1088(%rdi), %xmm6 7292; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] 7293; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7294; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 7295; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7296; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] 7297; SSE-NEXT: movdqa 1232(%rdi), %xmm2 7298; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7299; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] 7300; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 7301; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] 7302; SSE-NEXT: movdqa 1184(%rdi), %xmm1 7303; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7304; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 7305; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 7306; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7307; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7308; SSE-NEXT: # xmm0 = mem[2,2,3,3] 7309; SSE-NEXT: movdqa 
1328(%rdi), %xmm1 7310; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7311; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] 7312; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 7313; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7314; SSE-NEXT: # xmm3 = mem[2,3,2,3] 7315; SSE-NEXT: movdqa 1280(%rdi), %xmm1 7316; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7317; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 7318; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 7319; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7320; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7321; SSE-NEXT: # xmm0 = mem[2,2,3,3] 7322; SSE-NEXT: movdqa 1424(%rdi), %xmm1 7323; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7324; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] 7325; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 7326; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7327; SSE-NEXT: # xmm3 = mem[2,3,2,3] 7328; SSE-NEXT: movdqa 1376(%rdi), %xmm1 7329; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7330; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 7331; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 7332; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7333; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7334; SSE-NEXT: # xmm0 = mem[2,2,3,3] 7335; SSE-NEXT: movdqa 1520(%rdi), %xmm1 7336; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7337; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] 7338; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 7339; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7340; SSE-NEXT: # xmm3 = mem[2,3,2,3] 7341; SSE-NEXT: movdqa 1472(%rdi), %xmm1 7342; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7343; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 7344; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 7345; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7346; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7347; SSE-NEXT: # xmm3 = mem[3,3,3,3] 7348; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 7349; SSE-NEXT: # xmm4 = mem[1,1,1,1] 7350; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 7351; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7352; SSE-NEXT: # xmm0 = mem[2,3,2,3] 7353; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7354; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 7355; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 7356; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7357; SSE-NEXT: pshufd $255, (%rsp), %xmm2 # 16-byte Folded Reload 7358; SSE-NEXT: # xmm2 = mem[3,3,3,3] 7359; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7360; SSE-NEXT: # xmm3 = mem[1,1,1,1] 7361; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 7362; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7363; SSE-NEXT: # xmm0 = mem[2,3,2,3] 7364; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7365; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 7366; SSE-NEXT: movsd {{.*#+}} xmm0 = 
xmm2[0],xmm0[1] 7367; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill 7368; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7369; SSE-NEXT: # xmm0 = mem[3,3,3,3] 7370; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7371; SSE-NEXT: # xmm2 = mem[1,1,1,1] 7372; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 7373; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7374; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7375; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7376; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 7377; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7378; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7379; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7380; SSE-NEXT: # xmm0 = mem[3,3,3,3] 7381; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7382; SSE-NEXT: # xmm1 = mem[1,1,1,1] 7383; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7384; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7385; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7386; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7387; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 7388; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7389; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7390; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7391; SSE-NEXT: # xmm0 = mem[3,3,3,3] 7392; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7393; SSE-NEXT: # xmm1 = mem[1,1,1,1] 7394; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7395; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7396; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7397; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7398; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] 7399; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7400; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7401; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7402; SSE-NEXT: # xmm0 = mem[3,3,3,3] 7403; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7404; SSE-NEXT: # xmm1 = mem[1,1,1,1] 7405; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7406; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7407; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7408; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7409; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 7410; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7411; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7412; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7413; SSE-NEXT: # xmm0 = mem[3,3,3,3] 7414; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] 7415; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7416; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7417; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7418; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7419; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] 7420; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7421; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
7422; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] 7423; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7424; SSE-NEXT: # xmm1 = mem[1,1,1,1] 7425; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7426; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7427; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7428; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 7429; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] 7430; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7431; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7432; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7433; SSE-NEXT: # xmm0 = mem[3,3,3,3] 7434; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7435; SSE-NEXT: # xmm1 = mem[1,1,1,1] 7436; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7437; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7438; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7439; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 7440; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] 7441; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7442; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7443; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7444; SSE-NEXT: # xmm0 = mem[3,3,3,3] 7445; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] 7446; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7447; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7448; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7449; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 7450; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] 7451; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7452; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7453; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] 7454; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 7455; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] 7456; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7457; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7458; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7459; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 7460; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] 7461; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7462; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7463; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7464; SSE-NEXT: # xmm0 = mem[3,3,3,3] 7465; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] 7466; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7467; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7468; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7469; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 7470; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] 7471; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7472; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7473; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] 7474; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 7475; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] 7476; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 
7477; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7478; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7479; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7480; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 7481; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7482; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7483; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7484; SSE-NEXT: # xmm0 = mem[3,3,3,3] 7485; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7486; SSE-NEXT: # xmm1 = mem[1,1,1,1] 7487; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7488; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7489; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7490; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7491; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] 7492; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7493; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7494; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7495; SSE-NEXT: # xmm0 = mem[3,3,3,3] 7496; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 7497; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] 7498; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7499; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7500; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7501; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7502; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 7503; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7504; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7505; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7506; SSE-NEXT: # xmm0 = mem[3,3,3,3] 7507; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7508; SSE-NEXT: # xmm1 = mem[1,1,1,1] 7509; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7510; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7511; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7512; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 7513; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 7514; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 7515; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7516; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7517; SSE-NEXT: # xmm0 = mem[2,3,2,3] 7518; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7519; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7520; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7521; SSE-NEXT: # xmm0 = mem[2,2,3,3] 7522; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7523; SSE-NEXT: # xmm2 = mem[0,0,1,1] 7524; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 7525; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 7526; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7527; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7528; SSE-NEXT: # xmm0 = mem[2,3,2,3] 7529; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7530; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7531; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3 # 16-byte Folded Reload 7532; SSE-NEXT: # xmm3 = mem[2,2,3,3] 7533; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7534; SSE-NEXT: # xmm2 = mem[0,0,1,1] 7535; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 7536; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 7537; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7538; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7539; SSE-NEXT: # xmm0 = mem[2,3,2,3] 7540; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7541; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7542; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7543; SSE-NEXT: # xmm0 = mem[2,2,3,3] 7544; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7545; SSE-NEXT: # xmm2 = mem[0,0,1,1] 7546; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 7547; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 7548; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7549; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7550; SSE-NEXT: # xmm0 = mem[2,3,2,3] 7551; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7552; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7553; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7554; SSE-NEXT: # xmm0 = mem[2,2,3,3] 7555; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7556; SSE-NEXT: # xmm2 = mem[0,0,1,1] 7557; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 7558; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 7559; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7560; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7561; SSE-NEXT: # xmm0 = mem[2,3,2,3] 7562; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7563; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7564; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] 7565; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 7566; SSE-NEXT: # xmm15 = mem[0,0,1,1] 7567; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] 7568; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] 7569; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7570; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7571; SSE-NEXT: # xmm0 = mem[2,3,2,3] 7572; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7573; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7574; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7575; SSE-NEXT: # xmm0 = mem[2,2,3,3] 7576; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7577; SSE-NEXT: # xmm2 = mem[0,0,1,1] 7578; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 7579; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 7580; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7581; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7582; SSE-NEXT: # xmm0 = mem[2,3,2,3] 7583; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7584; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7585; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] 7586; SSE-NEXT: pshufd $80, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7587; SSE-NEXT: # xmm2 = mem[0,0,1,1] 7588; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 7589; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 7590; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7591; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7592; SSE-NEXT: # xmm0 = mem[2,3,2,3] 7593; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7594; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7595; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] 7596; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7597; SSE-NEXT: # xmm2 = mem[0,0,1,1] 7598; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 7599; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 7600; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7601; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7602; SSE-NEXT: # xmm0 = mem[2,3,2,3] 7603; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7604; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7605; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] 7606; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 7607; SSE-NEXT: # xmm11 = mem[0,0,1,1] 7608; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] 7609; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] 7610; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7611; SSE-NEXT: # xmm0 = mem[2,3,2,3] 7612; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7613; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7614; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] 7615; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 7616; SSE-NEXT: # xmm9 = mem[0,0,1,1] 7617; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] 7618; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] 7619; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] 7620; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7621; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7622; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] 7623; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 7624; SSE-NEXT: # xmm8 = mem[0,0,1,1] 7625; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] 7626; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] 7627; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 7628; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] 7629; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7630; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7631; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] 7632; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 7633; SSE-NEXT: # xmm13 = mem[0,0,1,1] 7634; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] 7635; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] 7636; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] 7637; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7638; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7639; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7640; SSE-NEXT: # xmm0 = mem[2,2,3,3] 7641; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm7 # 16-byte Folded Reload 7642; SSE-NEXT: # xmm7 = mem[0,0,1,1] 7643; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] 7644; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] 7645; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 7646; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] 7647; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7648; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7649; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] 7650; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 7651; SSE-NEXT: # xmm5 = mem[0,0,1,1] 7652; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 7653; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] 7654; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] 7655; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7656; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7657; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7658; SSE-NEXT: # xmm1 = mem[2,2,3,3] 7659; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7660; SSE-NEXT: # xmm3 = mem[0,0,1,1] 7661; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 7662; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] 7663; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 7664; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] 7665; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7666; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7667; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] 7668; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7669; SSE-NEXT: # xmm2 = mem[0,0,1,1] 7670; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 7671; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 7672; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7673; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7674; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7675; SSE-NEXT: # xmm1 = mem[3,3,3,3] 7676; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7677; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7678; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7679; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7680; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] 7681; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] 7682; SSE-NEXT: movapd %xmm15, %xmm4 7683; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7684; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7685; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7686; SSE-NEXT: # xmm1 = mem[3,3,3,3] 7687; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7688; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7689; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7690; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7691; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] 7692; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] 7693; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7694; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7695; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7696; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7697; SSE-NEXT: # xmm1 = 
mem[3,3,3,3] 7698; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7699; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7700; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7701; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7702; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] 7703; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] 7704; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7705; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7706; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7707; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7708; SSE-NEXT: # xmm1 = mem[3,3,3,3] 7709; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7710; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7711; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7712; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7713; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] 7714; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] 7715; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7716; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7717; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7718; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7719; SSE-NEXT: # xmm1 = mem[3,3,3,3] 7720; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7721; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7722; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7723; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7724; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] 7725; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] 7726; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7727; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7728; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7729; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7730; SSE-NEXT: # xmm1 = mem[3,3,3,3] 7731; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7732; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7733; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7734; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7735; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] 7736; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] 7737; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7738; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7739; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7740; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7741; SSE-NEXT: # xmm1 = mem[3,3,3,3] 7742; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7743; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7744; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7745; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7746; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] 7747; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] 7748; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7749; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7750; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7751; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Folded Reload 7752; SSE-NEXT: # xmm1 = mem[3,3,3,3] 7753; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7754; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7755; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7756; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7757; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] 7758; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] 7759; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7760; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7761; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7762; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7763; SSE-NEXT: # xmm1 = mem[3,3,3,3] 7764; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7765; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7766; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7767; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7768; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] 7769; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] 7770; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7771; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7772; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7773; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7774; SSE-NEXT: # xmm1 = mem[3,3,3,3] 7775; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7776; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7777; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7778; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7779; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] 7780; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] 7781; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7782; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7783; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7784; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7785; SSE-NEXT: # xmm1 = mem[3,3,3,3] 7786; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7787; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7788; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7789; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7790; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] 7791; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] 7792; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7793; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7794; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7795; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] 7796; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7797; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7798; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7799; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 7800; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] 7801; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] 7802; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7803; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7804; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7805; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 
7806; SSE-NEXT: # xmm1 = mem[3,3,3,3] 7807; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7808; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7809; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7810; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 7811; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] 7812; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] 7813; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7814; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7815; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7816; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] 7817; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7818; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7819; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7820; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7821; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] 7822; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] 7823; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7824; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7825; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7826; SSE-NEXT: # xmm1 = mem[3,3,3,3] 7827; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7828; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7829; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7830; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 7831; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] 7832; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] 7833; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7834; SSE-NEXT: # xmm0 = mem[1,1,1,1] 7835; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] 7836; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 7837; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7838; SSE-NEXT: # xmm1 = mem[2,3,2,3] 7839; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7840; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] 7841; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] 7842; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7843; SSE-NEXT: movaps %xmm0, 224(%rsi) 7844; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7845; SSE-NEXT: movaps %xmm0, 160(%rsi) 7846; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7847; SSE-NEXT: movaps %xmm0, 96(%rsi) 7848; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7849; SSE-NEXT: movaps %xmm0, 32(%rsi) 7850; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7851; SSE-NEXT: movaps %xmm0, 240(%rsi) 7852; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7853; SSE-NEXT: movaps %xmm0, 176(%rsi) 7854; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7855; SSE-NEXT: movaps %xmm0, 112(%rsi) 7856; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7857; SSE-NEXT: movaps %xmm0, 48(%rsi) 7858; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7859; SSE-NEXT: movaps %xmm0, 192(%rsi) 7860; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7861; SSE-NEXT: movaps %xmm0, 128(%rsi) 7862; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7863; SSE-NEXT: movaps %xmm0, 64(%rsi) 7864; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7865; SSE-NEXT: movaps %xmm0, (%rsi) 7866; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7867; SSE-NEXT: movaps %xmm0, 208(%rsi) 7868; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7869; SSE-NEXT: movaps %xmm0, 144(%rsi) 7870; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7871; SSE-NEXT: movaps %xmm0, 80(%rsi) 7872; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7873; SSE-NEXT: movaps %xmm0, 16(%rsi) 7874; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7875; SSE-NEXT: movaps %xmm0, 224(%rdx) 7876; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7877; SSE-NEXT: movaps %xmm0, 240(%rdx) 7878; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7879; SSE-NEXT: movaps %xmm0, 192(%rdx) 7880; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7881; SSE-NEXT: movaps %xmm0, 208(%rdx) 7882; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7883; SSE-NEXT: movaps %xmm0, 160(%rdx) 7884; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7885; SSE-NEXT: movaps %xmm0, 176(%rdx) 7886; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7887; SSE-NEXT: movaps %xmm0, 128(%rdx) 7888; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7889; SSE-NEXT: movaps %xmm0, 144(%rdx) 7890; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7891; SSE-NEXT: movaps %xmm0, 96(%rdx) 7892; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7893; SSE-NEXT: movaps %xmm0, 112(%rdx) 7894; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7895; SSE-NEXT: movaps %xmm0, 64(%rdx) 7896; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7897; SSE-NEXT: movaps %xmm0, 80(%rdx) 7898; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7899; SSE-NEXT: movaps %xmm0, 32(%rdx) 7900; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7901; SSE-NEXT: movaps %xmm0, 48(%rdx) 7902; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7903; SSE-NEXT: movaps %xmm0, (%rdx) 7904; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7905; SSE-NEXT: movaps %xmm0, 16(%rdx) 7906; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7907; SSE-NEXT: movaps %xmm0, 240(%rcx) 7908; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7909; SSE-NEXT: movaps %xmm0, 224(%rcx) 7910; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7911; SSE-NEXT: movaps %xmm0, 208(%rcx) 7912; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7913; SSE-NEXT: movaps %xmm0, 192(%rcx) 7914; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7915; SSE-NEXT: movaps %xmm0, 176(%rcx) 7916; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7917; SSE-NEXT: movaps %xmm0, 160(%rcx) 7918; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7919; SSE-NEXT: movaps %xmm0, 144(%rcx) 7920; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7921; SSE-NEXT: movaps %xmm0, 128(%rcx) 7922; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7923; SSE-NEXT: movaps %xmm0, 112(%rcx) 7924; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7925; SSE-NEXT: movaps %xmm0, 96(%rcx) 7926; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7927; SSE-NEXT: movaps %xmm0, 
80(%rcx) 7928; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7929; SSE-NEXT: movaps %xmm0, 64(%rcx) 7930; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7931; SSE-NEXT: movaps %xmm0, 48(%rcx) 7932; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7933; SSE-NEXT: movaps %xmm0, 32(%rcx) 7934; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7935; SSE-NEXT: movaps %xmm0, 16(%rcx) 7936; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7937; SSE-NEXT: movaps %xmm0, (%rcx) 7938; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7939; SSE-NEXT: movaps %xmm0, 240(%r8) 7940; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7941; SSE-NEXT: movaps %xmm0, 224(%r8) 7942; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7943; SSE-NEXT: movaps %xmm0, 208(%r8) 7944; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7945; SSE-NEXT: movaps %xmm0, 192(%r8) 7946; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7947; SSE-NEXT: movaps %xmm0, 176(%r8) 7948; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7949; SSE-NEXT: movaps %xmm0, 160(%r8) 7950; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7951; SSE-NEXT: movaps %xmm0, 144(%r8) 7952; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7953; SSE-NEXT: movaps %xmm0, 128(%r8) 7954; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7955; SSE-NEXT: movaps %xmm0, 112(%r8) 7956; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7957; SSE-NEXT: movaps %xmm0, 96(%r8) 7958; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7959; SSE-NEXT: movaps %xmm0, 80(%r8) 7960; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7961; SSE-NEXT: movaps %xmm0, 64(%r8) 7962; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7963; SSE-NEXT: movaps %xmm0, 48(%r8) 7964; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7965; SSE-NEXT: movaps %xmm0, 32(%r8) 7966; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 7967; SSE-NEXT: movaps %xmm0, 16(%r8) 7968; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7969; SSE-NEXT: movaps %xmm0, (%r8) 7970; SSE-NEXT: movapd %xmm2, 240(%r9) 7971; SSE-NEXT: movapd %xmm3, 224(%r9) 7972; SSE-NEXT: movapd %xmm5, 208(%r9) 7973; SSE-NEXT: movapd %xmm7, 192(%r9) 7974; SSE-NEXT: movapd %xmm13, 176(%r9) 7975; SSE-NEXT: movapd %xmm8, 160(%r9) 7976; SSE-NEXT: movapd %xmm9, 144(%r9) 7977; SSE-NEXT: movapd %xmm11, 128(%r9) 7978; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7979; SSE-NEXT: movaps %xmm0, 112(%r9) 7980; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7981; SSE-NEXT: movaps %xmm0, 96(%r9) 7982; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7983; SSE-NEXT: movaps %xmm0, 80(%r9) 7984; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7985; SSE-NEXT: movaps %xmm0, 64(%r9) 7986; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7987; SSE-NEXT: movaps %xmm0, 48(%r9) 7988; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7989; SSE-NEXT: movaps %xmm0, 32(%r9) 7990; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7991; SSE-NEXT: movaps %xmm0, 16(%r9) 7992; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7993; SSE-NEXT: movaps %xmm0, (%r9) 7994; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 7995; 
SSE-NEXT: movapd %xmm14, 240(%rax)
; SSE-NEXT: movapd %xmm12, 224(%rax)
; SSE-NEXT: movapd %xmm15, 208(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 192(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 176(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 160(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 144(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 128(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 112(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 96(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 64(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 48(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rax)
; SSE-NEXT: movapd %xmm4, (%rax)
; SSE-NEXT: addq $2184, %rsp # imm = 0x888
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_stride6_vf64:
; AVX: # %bb.0:
; AVX-NEXT: subq $2584, %rsp # imm = 0xA18
; AVX-NEXT: vmovaps 608(%rdi), %ymm6
; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 672(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 640(%rdi), %ymm3
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 352(%rdi), %ymm4
; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 320(%rdi), %ymm5
; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 224(%rdi), %ymm7
; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 192(%rdi), %ymm8
; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 288(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 256(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm9
; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm9[0,0],ymm1[6,4],ymm9[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm9
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 672(%rdi), %ymm3, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,2],ymm0[6,4],ymm1[6,6]
; AVX-NEXT: vmovaps 576(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovapd 736(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 704(%rdi), %ymm2
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 1056(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 1024(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,2],ymm0[6,4],ymm2[6,6]
; AVX-NEXT: vmovaps 992(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 960(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm13
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm13[0,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovapd 1120(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 1088(%rdi), %ymm2
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 1440(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 1408(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm11
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm11[0,0],ymm1[6,4],ymm11[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[2,2],ymm0[6,4],ymm11[6,6]
; AVX-NEXT: vmovaps 1376(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 1344(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm10
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[0,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovapd 1504(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 1472(%rdi), %ymm2
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 96(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 64(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm8
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm8[0,0],ymm1[6,4],ymm8[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,2],ymm0[6,4],ymm8[6,6]
; AVX-NEXT: vmovaps 32(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps (%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovapd 160(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 128(%rdi), %ymm2
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 480(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 448(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm6
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm6[0,0],ymm1[6,4],ymm6[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,2],ymm0[6,4],ymm6[6,6]
; AVX-NEXT: vmovaps 416(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 384(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovapd 544(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 512(%rdi), %ymm2
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 864(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 832(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm3
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm3[0,0],ymm1[6,4],ymm3[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,2],ymm0[6,4],ymm3[6,6]
; AVX-NEXT: vmovaps 800(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 768(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovapd 928(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 896(%rdi), %ymm5
; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[0,1]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 1248(%rdi), %ymm5
; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 1216(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm1[0,0],ymm5[6,4],ymm1[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm0[2,0],ymm1[2,2],ymm0[6,4],ymm1[6,6]
; AVX-NEXT: vmovaps 1184(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 1152(%rdi), %ymm5
; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1],xmm5[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,2],xmm5[0,3]
; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX-NEXT: vmovapd 1312(%rdi), %ymm12
; AVX-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 1280(%rdi), %ymm0
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm12[0,1]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm15 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm12[3,0],ymm0[1,0],ymm12[7,4],ymm0[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm9[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm9[1,3]
; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1],ymm12[1,3],ymm0[7,5],ymm12[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm9[3,0],ymm0[1,0],ymm9[7,4],ymm0[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm1[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm1[1,3]
; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX-NEXT: # ymm15 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,0],ymm1[1,0],ymm0[7,4],ymm1[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm1[2,3],ymm14[6,4],ymm1[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm13[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm15[0,2],xmm13[1,3]
; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,1],ymm15[1,3],ymm0[7,5],ymm15[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm14[3,0],ymm11[1,0],ymm14[7,4],ymm11[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,0],ymm11[2,3],ymm13[6,4],ymm11[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,0],xmm10[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm13[0,2],xmm10[1,3]
; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm0[3,1],ymm13[1,3],ymm0[7,5],ymm13[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm11[3,0],ymm8[1,0],ymm11[7,4],ymm8[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm8[2,3],ymm10[6,4],ymm8[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm0[1,0],xmm7[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,2],xmm7[1,3]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
; AVX-NEXT: # ymm8 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm6[1,0],ymm8[7,4],ymm6[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,0],xmm4[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm7[0,2],xmm4[1,3]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,1],ymm7[1,3],ymm0[7,5],ymm7[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,0],ymm3[1,0],ymm6[7,4],ymm3[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,0],xmm2[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm4[0,2],xmm2[1,3]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm4[1,3],ymm0[7,5],ymm4[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm0[1,0],ymm3[7,4],ymm0[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm5[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm5[1,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm2[3,1],mem[1,3],ymm2[7,5],mem[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
; AVX-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,0,1]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm12[2,0],ymm1[4,4],ymm12[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm9[2,1],mem[2,0],ymm9[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm14[2,1],mem[2,0],ymm14[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload
; AVX-NEXT: # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,0,1]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm10[2,0],ymm1[4,4],ymm10[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
; AVX-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm9[2,0],ymm1[4,4],ymm9[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
; AVX-NEXT: # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm6[2,1],mem[2,0],ymm6[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm8, %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
; AVX-NEXT: # ymm6 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0],ymm6[2,0],ymm13[4,4],ymm6[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps %ymm3, %ymm4
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,1],ymm3[2,0],ymm4[6,5],ymm3[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm5, %xmm7
; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm5[2,0],xmm7[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX-NEXT: # ymm15 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,0],ymm15[2,0],ymm2[4,4],ymm15[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm12[3,1],ymm0[4,5],ymm12[7,5]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
; AVX-NEXT: # xmm11 = xmm1[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm1[3,1],ymm14[2,1],ymm1[7,5],ymm14[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
; AVX-NEXT: # xmm11 = xmm1[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
; AVX-NEXT: # ymm12 = ymm12[3,1],mem[2,1],ymm12[7,5],mem[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
; AVX-NEXT: # xmm11 = xmm0[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX-NEXT: # ymm12 = ymm0[3,1],mem[2,1],ymm0[7,5],mem[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm10[3,1],ymm0[4,5],ymm10[7,5]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload
; AVX-NEXT: # xmm10 = xmm1[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX-NEXT: # ymm11 = ymm11[3,1],mem[2,1],ymm11[7,5],mem[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm15[3,1],ymm2[4,5],ymm15[7,5]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,1],xmm7[3,3]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5]
; AVX-NEXT: vmovaps %ymm4, %ymm15
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm6[3,1],ymm13[4,5],ymm6[7,5]
; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm8[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm6[2,1],ymm3[7,5],ymm6[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm9[3,1],ymm0[4,5],ymm9[7,5]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm1[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm8[2,1],ymm4[7,5],ymm8[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm1[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm7[2,1],ymm5[7,5],ymm7[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 32(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 16(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT: vmovapd 80(%rdi), %xmm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[3]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm10[0,0],ymm2[6,4],ymm10[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm1[2,0],ymm10[4,6],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 224(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 208(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT: vmovapd 272(%rdi), %xmm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm1[2,0],ymm7[4,6],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 416(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 400(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT: vmovapd 464(%rdi), %xmm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[3]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm5[0,0],ymm2[6,4],ymm5[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm1[2,0],ymm5[4,6],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX-NEXT: # ymm12 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 608(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 592(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT: vmovapd 656(%rdi), %xmm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm12[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,0],ymm4[0,0],ymm12[6,4],ymm4[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm1[2,0],ymm4[4,6],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
; AVX-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 800(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 784(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT: vmovapd 848(%rdi), %xmm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[3]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,0],ymm3[0,0],ymm11[6,4],ymm3[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm1[2,0],ymm3[4,6],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX-NEXT: # ymm14 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 992(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 976(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT: vmovapd 1040(%rdi), %xmm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm14[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm2[0,0],ymm14[6,4],ymm2[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
; AVX-NEXT: # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 1184(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 1168(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT: vmovapd 1232(%rdi), %xmm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,0],ymm15[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm1[0,0],ymm8[6,4],ymm1[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,2],ymm6[2,0],ymm1[4,6],ymm6[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 1376(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 1360(%rdi), %xmm9
; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3]
; AVX-NEXT: vmovapd 1424(%rdi), %xmm9
; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
; AVX-NEXT: # ymm15 = ymm9[1],mem[0],ymm9[2],mem[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,1],ymm15[2,0],ymm9[4,5],ymm15[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm6[2,0],ymm0[0,0],ymm6[6,4],ymm0[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,2],ymm13[2,0],ymm0[4,6],ymm13[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4],ymm13[5,6,7]
; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm9[3,0],ymm10[1,0],ymm9[7,4],ymm10[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,3],ymm13[2,0],ymm10[4,7],ymm13[6,4]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload
; AVX-NEXT: # xmm13 = xmm10[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
; AVX-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm10[3,0],ymm7[1,0],ymm10[7,4],ymm7[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,3],ymm13[2,0],ymm7[4,7],ymm13[6,4]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload
; AVX-NEXT: # xmm13 = xmm10[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
; AVX-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3,4],ymm7[5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm10[3,0],ymm5[1,0],ymm10[7,4],ymm5[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm13[2,0],ymm5[4,7],ymm13[6,4]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload
; AVX-NEXT: # xmm13 = xmm10[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
; AVX-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm12[2,0],ymm4[4,7],ymm12[6,4]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload
; AVX-NEXT: # xmm12 = xmm10[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload
; AVX-NEXT: # ymm13 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm10[1,1],ymm13[2,0],ymm10[5,5],ymm13[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm11[2,0],ymm3[4,7],ymm11[6,4]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm11 # 16-byte Folded Reload
; AVX-NEXT: # xmm11 = xmm10[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload
; AVX-NEXT: # ymm12 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm10[1,1],ymm12[2,0],ymm10[5,5],ymm12[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm14[3,0],ymm2[1,0],ymm14[7,4],ymm2[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,3],ymm10[2,0],ymm2[4,7],ymm10[6,4]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
; AVX-NEXT: # xmm10 = xmm10[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX-NEXT: # ymm11 = ymm11[3,1],mem[1,3],ymm11[7,5],mem[5,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm12[1,1],ymm11[2,0],ymm12[5,5],ymm11[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm1[1,0],ymm8[7,4],ymm1[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm8[2,0],ymm1[4,7],ymm8[6,4]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
; AVX-NEXT: # xmm8 = xmm8[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
; AVX-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,1],ymm10[2,0],ymm11[5,5],ymm10[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm0[1,0],ymm6[7,4],ymm0[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm6[2,0],ymm0[4,7],ymm6[6,4]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
; AVX-NEXT: # xmm6 = xmm6[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX-NEXT: # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,1],ymm8[2,0],ymm10[5,5],ymm8[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 192(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 128(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 64(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, (%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 224(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 160(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 96(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 32(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 192(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 128(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 64(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, (%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 224(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 160(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 96(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 32(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 192(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 128(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 64(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, (%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 224(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 160(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 96(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 32(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, (%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 64(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 128(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 192(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 224(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 160(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 96(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 32(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 224(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 192(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 160(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 128(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 96(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 64(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, 32(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm6, (%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps %ymm0, 224(%rax)
; AVX-NEXT: vmovaps %ymm1, 192(%rax)
; AVX-NEXT: vmovaps %ymm2, 160(%rax)
; AVX-NEXT: vmovaps %ymm3, 128(%rax)
; AVX-NEXT: vmovaps %ymm4, 96(%rax)
; AVX-NEXT: vmovaps %ymm5, 64(%rax)
; AVX-NEXT: vmovaps %ymm7, 32(%rax)
; AVX-NEXT: vmovaps %ymm9, (%rax)
; AVX-NEXT: addq $2584, %rsp # imm = 0xA18
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i32_stride6_vf64:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $2568, %rsp # imm = 0xA08
; AVX2-NEXT: vmovaps 672(%rdi), %ymm4
; AVX2-NEXT: vmovaps 640(%rdi), %ymm5
; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 608(%rdi), %ymm3
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 320(%rdi), %ymm6
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 352(%rdi), %ymm7
; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 288(%rdi), %ymm2
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 256(%rdi), %ymm8
; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 224(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 192(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{.*#+}} xmm9 = [0,6,4,u]
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpermps %ymm14, %ymm9, %ymm0
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm2[0,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,2,2,2,4,6,6,6]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2]
; AVX2-NEXT: vpermps %ymm1, %ymm6, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[0,1],ymm4[0,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 576(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm0, %ymm9, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-NEXT: vmovaps 704(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 736(%rdi), %ymm2
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm1, %ymm6, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 1056(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 1024(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 992(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 960(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm0, %ymm9, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-NEXT: vmovaps 1088(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 1120(%rdi), %ymm2
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm1, %ymm6, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 1440(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 1408(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 1376(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 1344(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm0, %ymm9, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-NEXT: vmovaps 1472(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 1504(%rdi), %ymm2
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm1, %ymm6, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 96(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-NEXT: vmovaps 64(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vpermps %ymm13, %ymm9, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-NEXT: vmovaps 128(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 160(%rdi), %ymm2
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpermps %ymm12, %ymm6, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 480(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 448(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovaps 416(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 384(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpermps %ymm10, %ymm9, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,2,2,2,4,6,6,6]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-NEXT: vmovaps 512(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 544(%rdi), %ymm2
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpermps %ymm8, %ymm6, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 864(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 832(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovaps 800(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 768(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpermps %ymm4, %ymm9, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2,2,2,4,6,6,6]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-NEXT: vmovaps 896(%rdi), %ymm1
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 928(%rdi), %ymm2
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} ymm3 =
ymm2[0,1,2,3],ymm1[4,5,6,7] 9130; AVX2-NEXT: vpermps %ymm3, %ymm6, %ymm2 9131; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] 9132; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9133; AVX2-NEXT: vmovaps 1184(%rdi), %ymm0 9134; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9135; AVX2-NEXT: vmovaps 1152(%rdi), %ymm1 9136; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9137; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] 9138; AVX2-NEXT: vpermps %ymm2, %ymm9, %ymm0 9139; AVX2-NEXT: vmovaps 1248(%rdi), %ymm1 9140; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9141; AVX2-NEXT: vmovaps 1216(%rdi), %ymm5 9142; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9143; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[0,1],ymm1[0,1] 9144; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] 9145; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,2,2,2,4,6,6,6] 9146; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] 9147; AVX2-NEXT: vmovaps 1280(%rdi), %ymm1 9148; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9149; AVX2-NEXT: vmovaps 1312(%rdi), %ymm5 9150; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9151; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] 9152; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm1 9153; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 9154; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9155; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [1,7,5,u] 9156; AVX2-NEXT: vpermps %ymm14, %ymm0, %ymm1 9157; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,3,2,3,5,7,6,7] 9158; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7] 9159; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5,3,5,3,5,3,5,3] 9160; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload 9161; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] 9162; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9163; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload 9164; AVX2-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload 9165; AVX2-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] 9166; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] 9167; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload 9168; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 9169; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9170; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload 9171; AVX2-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload 9172; AVX2-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] 9173; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] 9174; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload 9175; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 9176; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9177; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload 9178; AVX2-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload 9179; AVX2-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] 9180; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] 9181; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload 
9182; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] 9183; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9184; AVX2-NEXT: vpermps %ymm13, %ymm0, %ymm13 9185; AVX2-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 9186; AVX2-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] 9187; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] 9188; AVX2-NEXT: vpermps %ymm12, %ymm1, %ymm12 9189; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] 9190; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9191; AVX2-NEXT: vpermps %ymm10, %ymm0, %ymm10 9192; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] 9193; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] 9194; AVX2-NEXT: vpermps %ymm8, %ymm1, %ymm8 9195; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7] 9196; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9197; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm4 9198; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] 9199; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] 9200; AVX2-NEXT: vpermps %ymm3, %ymm1, %ymm3 9201; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] 9202; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9203; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm0 9204; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,3,2,3,5,7,6,7] 9205; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] 9206; AVX2-NEXT: vpermps %ymm5, %ymm1, %ymm2 9207; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] 9208; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9209; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 9210; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 9211; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] 9212; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] 9213; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 9214; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9215; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9216; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] 9217; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9218; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] 9219; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] 9220; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 9221; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9222; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9223; AVX2-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] 9224; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9225; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4] 9226; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] 9227; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9228; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9229; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9230; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9231; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] 9232; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] 9233; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 9234; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9235; AVX2-NEXT: 
vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9236; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] 9237; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9238; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] 9239; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] 9240; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 9241; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9242; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9243; AVX2-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] 9244; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9245; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4] 9246; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] 9247; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9248; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9249; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 9250; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload 9251; AVX2-NEXT: # ymm0 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] 9252; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] 9253; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 9254; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9255; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9256; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] 9257; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9258; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] 9259; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] 9260; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 9261; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9262; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9263; AVX2-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] 9264; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9265; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4] 9266; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] 9267; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9268; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9269; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9270; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9271; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] 9272; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] 9273; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 9274; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9275; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9276; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] 9277; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9278; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] 9279; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] 9280; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 9281; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9282; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9283; AVX2-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] 9284; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9285; AVX2-NEXT: vshufps {{.*#+}} ymm2 = 
ymm2[0,0,2,0,4,4,6,4] 9286; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] 9287; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9288; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9289; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 9290; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9291; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] 9292; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] 9293; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 9294; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9295; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9296; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 9297; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9298; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] 9299; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] 9300; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 9301; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9302; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9303; AVX2-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] 9304; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9305; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4] 9306; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] 9307; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9308; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9309; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9310; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9311; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] 9312; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] 9313; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,2,0,3] 9314; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9315; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9316; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] 9317; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9318; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0,2,3,6,4,6,7] 9319; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] 9320; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] 9321; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9322; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9323; AVX2-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] 9324; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9325; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0,2,0,4,4,6,4] 9326; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] 9327; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] 9328; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9329; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 9330; AVX2-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload 9331; AVX2-NEXT: # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] 9332; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] 9333; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] 9334; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9335; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload 9336; AVX2-NEXT: # ymm14 = 
ymm0[0,1],mem[2,3],ymm0[4,5,6,7] 9337; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm14[2,0,2,3,6,4,6,7] 9338; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] 9339; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] 9340; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9341; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload 9342; AVX2-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] 9343; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,0,4,4,6,4] 9344; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] 9345; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] 9346; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9347; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 9348; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 9349; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] 9350; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] 9351; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,2,0,3] 9352; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9353; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 9354; AVX2-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] 9355; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,0,2,3,6,4,6,7] 9356; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] 9357; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] 9358; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9359; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 9360; AVX2-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] 9361; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,0,2,0,4,4,6,4] 9362; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] 9363; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] 9364; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9365; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] 9366; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] 9367; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 9368; AVX2-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 9369; AVX2-NEXT: # ymm3 = mem[3,1,3,3,7,5,7,7] 9370; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] 9371; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] 9372; AVX2-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 9373; AVX2-NEXT: # ymm3 = mem[0,1,3,1,4,5,7,5] 9374; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] 9375; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] 9376; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9377; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] 9378; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] 9379; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 9380; AVX2-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 9381; AVX2-NEXT: # ymm3 = mem[3,1,3,3,7,5,7,7] 9382; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] 9383; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] 9384; AVX2-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 9385; AVX2-NEXT: # ymm3 = mem[0,1,3,1,4,5,7,5] 9386; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] 9387; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] 9388; AVX2-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9389; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7] 9390; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9391; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] 9392; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 9393; AVX2-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 9394; AVX2-NEXT: # ymm3 = mem[3,1,3,3,7,5,7,7] 9395; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] 9396; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] 9397; AVX2-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 9398; AVX2-NEXT: # ymm3 = mem[0,1,3,1,4,5,7,5] 9399; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] 9400; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] 9401; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9402; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9403; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] 9404; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 9405; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] 9406; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 9407; AVX2-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 9408; AVX2-NEXT: # ymm3 = mem[3,1,3,3,7,5,7,7] 9409; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] 9410; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] 9411; AVX2-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 9412; AVX2-NEXT: # ymm3 = mem[0,1,3,1,4,5,7,5] 9413; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] 9414; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] 9415; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9416; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7] 9417; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] 9418; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 9419; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1,3,3,7,5,7,7] 9420; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] 9421; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 9422; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,7,5] 9423; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] 9424; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9425; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9426; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 9427; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] 9428; AVX2-NEXT: vmovaps %ymm13, %ymm5 9429; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] 9430; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 9431; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,1,3,3,7,5,7,7] 9432; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] 9433; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 9434; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,1,4,5,7,5] 9435; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] 9436; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9437; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9438; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 9439; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,3,3,3,7,7,7,7] 9440; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 9441; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] 9442; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 9443; AVX2-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 9444; AVX2-NEXT: # ymm2 = mem[3,1,3,3,7,5,7,7] 9445; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] 9446; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 9447; AVX2-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 9448; AVX2-NEXT: # ymm2 = mem[0,1,3,1,4,5,7,5] 9449; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] 9450; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9451; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9452; AVX2-NEXT: vmovups (%rsp), %ymm14 # 32-byte Reload 9453; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,3,3,3,7,7,7,7] 9454; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 9455; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] 9456; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 9457; AVX2-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 9458; AVX2-NEXT: # ymm2 = mem[3,1,3,3,7,5,7,7] 9459; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] 9460; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 9461; AVX2-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 9462; AVX2-NEXT: # ymm2 = mem[0,1,3,1,4,5,7,5] 9463; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] 9464; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9465; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9466; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm14[4,5,6,7] 9467; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9468; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9469; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 9470; AVX2-NEXT: # ymm3 = mem[0,1,2,3],ymm0[4,5,6,7] 9471; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9472; AVX2-NEXT: vmovaps 80(%rdi), %xmm0 9473; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9474; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] 9475; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 9476; AVX2-NEXT: vpermps %ymm3, %ymm6, %ymm2 9477; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 9478; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9479; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9480; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 9481; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9482; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6] 9483; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 9484; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2 9485; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9486; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9487; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9488; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 9489; AVX2-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] 9490; AVX2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill 9491; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9492; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload 9493; AVX2-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] 9494; AVX2-NEXT: 
vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9495; AVX2-NEXT: vmovaps 272(%rdi), %xmm0 9496; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9497; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] 9498; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 9499; AVX2-NEXT: vpermps %ymm14, %ymm6, %ymm2 9500; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 9501; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9502; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9503; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 9504; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9505; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2 9506; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9507; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9508; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7] 9509; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9510; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9511; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload 9512; AVX2-NEXT: # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7] 9513; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9514; AVX2-NEXT: vmovaps 464(%rdi), %xmm0 9515; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9516; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] 9517; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 9518; AVX2-NEXT: vpermps %ymm12, %ymm6, %ymm2 9519; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 9520; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9521; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9522; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 9523; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9524; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2 9525; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9526; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9527; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9528; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 9529; AVX2-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] 9530; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9531; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9532; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload 9533; AVX2-NEXT: # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7] 9534; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9535; AVX2-NEXT: vmovaps 656(%rdi), %xmm0 9536; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9537; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] 9538; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 9539; AVX2-NEXT: vpermps %ymm12, %ymm6, %ymm2 9540; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 9541; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9542; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9543; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 9544; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9545; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2 9546; AVX2-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9547; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9548; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm11[4,5,6,7] 9549; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9550; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9551; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 9552; AVX2-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] 9553; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9554; AVX2-NEXT: vmovaps 848(%rdi), %xmm0 9555; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9556; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] 9557; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 9558; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm2 9559; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 9560; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9561; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9562; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 9563; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9564; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2 9565; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9566; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9567; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload 9568; AVX2-NEXT: # ymm15 = ymm9[0,1,2,3],mem[4,5,6,7] 9569; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9570; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload 9571; AVX2-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] 9572; AVX2-NEXT: vmovaps 1040(%rdi), %xmm13 9573; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] 9574; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 9575; AVX2-NEXT: vpermps %ymm14, %ymm6, %ymm2 9576; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 9577; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9578; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload 9579; AVX2-NEXT: # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 9580; AVX2-NEXT: vpermps %ymm12, %ymm3, %ymm2 9581; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9582; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9583; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm10[4,5,6,7] 9584; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9585; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 9586; AVX2-NEXT: # ymm10 = ymm0[0,1,2,3],mem[4,5,6,7] 9587; AVX2-NEXT: vmovaps 1232(%rdi), %xmm9 9588; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7] 9589; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 9590; AVX2-NEXT: vpermps %ymm10, %ymm6, %ymm2 9591; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 9592; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9593; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload 9594; AVX2-NEXT: # ymm8 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 9595; AVX2-NEXT: vpermps %ymm8, %ymm3, %ymm2 9596; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 9597; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9598; AVX2-NEXT: vblendps {{.*#+}} 
ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7] 9599; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9600; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 9601; AVX2-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] 9602; AVX2-NEXT: vmovaps 1424(%rdi), %xmm4 9603; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] 9604; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 9605; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm2 9606; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 9607; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9608; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9609; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 9610; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm3 9611; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] 9612; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9613; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 9614; AVX2-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] 9615; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9616; AVX2-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] 9617; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload 9618; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3,4,5,6,7] 9619; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] 9620; AVX2-NEXT: # ymm0 = mem[0,1,0,1] 9621; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 9622; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] 9623; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9624; AVX2-NEXT: vpermilps $85, (%rsp), %ymm3 # 32-byte Folded Reload 9625; AVX2-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] 9626; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 9627; AVX2-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] 9628; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 9629; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] 9630; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 9631; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] 9632; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9633; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 9634; AVX2-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] 9635; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 9636; AVX2-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] 9637; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 9638; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] 9639; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 9640; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] 9641; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9642; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 9643; AVX2-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] 9644; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 9645; AVX2-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] 9646; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 9647; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] 9648; AVX2-NEXT: vpermps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 9649; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] 9650; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9651; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 9652; AVX2-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] 9653; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 9654; AVX2-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] 9655; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload 9656; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] 9657; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 9658; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7] 9659; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm15[1,1,1,1,5,5,5,5] 9660; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7] 9661; AVX2-NEXT: vpermps %ymm14, %ymm1, %ymm13 9662; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5,6,7] 9663; AVX2-NEXT: vpermps %ymm12, %ymm0, %ymm12 9664; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4],ymm12[5,6,7] 9665; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5] 9666; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5,6,7] 9667; AVX2-NEXT: vpermps %ymm10, %ymm1, %ymm9 9668; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5,6,7] 9669; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm8 9670; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7] 9671; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1,1,1,5,5,5,5] 9672; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] 9673; AVX2-NEXT: vpermps %ymm5, %ymm1, %ymm1 9674; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] 9675; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm0 9676; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] 9677; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9678; AVX2-NEXT: vmovaps %ymm1, 192(%rsi) 9679; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9680; AVX2-NEXT: vmovaps %ymm1, 128(%rsi) 9681; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9682; AVX2-NEXT: vmovaps %ymm1, 64(%rsi) 9683; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9684; AVX2-NEXT: vmovaps %ymm1, (%rsi) 9685; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9686; AVX2-NEXT: vmovaps %ymm1, 224(%rsi) 9687; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9688; AVX2-NEXT: vmovaps %ymm1, 160(%rsi) 9689; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9690; AVX2-NEXT: vmovaps %ymm1, 96(%rsi) 9691; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9692; AVX2-NEXT: vmovaps %ymm1, 32(%rsi) 9693; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9694; AVX2-NEXT: vmovaps %ymm1, 192(%rdx) 9695; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9696; AVX2-NEXT: vmovaps %ymm1, 128(%rdx) 9697; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9698; AVX2-NEXT: vmovaps %ymm1, 64(%rdx) 9699; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9700; AVX2-NEXT: vmovaps %ymm1, (%rdx) 9701; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9702; AVX2-NEXT: vmovaps %ymm1, 224(%rdx) 9703; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9704; AVX2-NEXT: vmovaps %ymm1, 160(%rdx) 9705; 
AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9706; AVX2-NEXT: vmovaps %ymm1, 96(%rdx) 9707; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9708; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) 9709; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9710; AVX2-NEXT: vmovaps %ymm1, 192(%rcx) 9711; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9712; AVX2-NEXT: vmovaps %ymm1, 128(%rcx) 9713; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9714; AVX2-NEXT: vmovaps %ymm1, 64(%rcx) 9715; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9716; AVX2-NEXT: vmovaps %ymm1, (%rcx) 9717; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9718; AVX2-NEXT: vmovaps %ymm1, 224(%rcx) 9719; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9720; AVX2-NEXT: vmovaps %ymm1, 160(%rcx) 9721; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9722; AVX2-NEXT: vmovaps %ymm1, 96(%rcx) 9723; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9724; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 9725; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9726; AVX2-NEXT: vmovaps %ymm1, (%r8) 9727; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9728; AVX2-NEXT: vmovaps %ymm1, 64(%r8) 9729; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9730; AVX2-NEXT: vmovaps %ymm1, 128(%r8) 9731; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9732; AVX2-NEXT: vmovaps %ymm1, 192(%r8) 9733; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9734; AVX2-NEXT: vmovaps %ymm1, 224(%r8) 9735; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9736; AVX2-NEXT: vmovaps %ymm1, 160(%r8) 9737; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9738; AVX2-NEXT: vmovaps %ymm1, 96(%r8) 9739; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9740; AVX2-NEXT: vmovaps %ymm1, 32(%r8) 9741; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9742; AVX2-NEXT: vmovaps %ymm1, 224(%r9) 9743; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9744; AVX2-NEXT: vmovaps %ymm1, 192(%r9) 9745; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9746; AVX2-NEXT: vmovaps %ymm1, 160(%r9) 9747; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9748; AVX2-NEXT: vmovaps %ymm1, 128(%r9) 9749; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9750; AVX2-NEXT: vmovaps %ymm1, 96(%r9) 9751; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9752; AVX2-NEXT: vmovaps %ymm1, 64(%r9) 9753; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9754; AVX2-NEXT: vmovaps %ymm1, 32(%r9) 9755; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9756; AVX2-NEXT: vmovaps %ymm1, (%r9) 9757; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 9758; AVX2-NEXT: vmovaps %ymm0, 224(%rax) 9759; AVX2-NEXT: vmovaps %ymm8, 192(%rax) 9760; AVX2-NEXT: vmovaps %ymm12, 160(%rax) 9761; AVX2-NEXT: vmovaps %ymm6, 128(%rax) 9762; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9763; AVX2-NEXT: vmovaps %ymm0, 96(%rax) 9764; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9765; AVX2-NEXT: vmovaps %ymm0, 64(%rax) 9766; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9767; AVX2-NEXT: vmovaps %ymm0, 32(%rax) 9768; AVX2-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9769; AVX2-NEXT: vmovaps %ymm0, (%rax) 9770; AVX2-NEXT: addq $2568, %rsp # imm = 0xA08 9771; AVX2-NEXT: vzeroupper 9772; AVX2-NEXT: retq 9773; 9774; AVX2-FP-LABEL: load_i32_stride6_vf64: 9775; AVX2-FP: # %bb.0: 9776; AVX2-FP-NEXT: subq $2568, %rsp # imm = 0xA08 9777; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm4 9778; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm5 9779; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9780; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm3 9781; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9782; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm6 9783; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9784; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm7 9785; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9786; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm2 9787; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9788; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm8 9789; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9790; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm0 9791; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9792; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm1 9793; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9794; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm9 = [0,6,4,u] 9795; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] 9796; AVX2-FP-NEXT: vpermps %ymm14, %ymm9, %ymm0 9797; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm2[0,1] 9798; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],ymm2[6,7] 9799; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,2,2,2,4,6,6,6] 9800; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] 9801; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] 9802; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9803; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2] 9804; AVX2-FP-NEXT: vpermps %ymm1, %ymm6, %ymm2 9805; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] 9806; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9807; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9808; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[0,1],ymm4[0,1] 9809; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm4[6,7] 9810; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9811; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm0 9812; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9813; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] 9814; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9815; AVX2-FP-NEXT: vpermps %ymm0, %ymm9, %ymm0 9816; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6] 9817; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] 9818; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm1 9819; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9820; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm2 9821; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9822; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 9823; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9824; AVX2-FP-NEXT: vpermps %ymm1, %ymm6, %ymm2 9825; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] 9826; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
; AVX2-FP-NEXT: vmovaps 1056(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 1024(%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 992(%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm0, %ymm9, %ymm0
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT: vmovaps 1088(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 1120(%rdi), %ymm2
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm1, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 1440(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 1408(%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 1376(%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 1344(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm0, %ymm9, %ymm0
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT: vmovaps 1472(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 1504(%rdi), %ymm2
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm1, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vpermps %ymm13, %ymm9, %ymm0
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm2
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm12, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpermps %ymm10, %ymm9, %ymm0
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm2
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm8, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 832(%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovaps 800(%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 768(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpermps %ymm4, %ymm9, %ymm0
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm2
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm3, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 1184(%rdi), %ymm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 1152(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpermps %ymm2, %ymm9, %ymm0
; AVX2-FP-NEXT: vmovaps 1248(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 1216(%rdi), %ymm5
; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[0,1],ymm1[0,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,2,2,2,4,6,6,6]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7]
; AVX2-FP-NEXT: vmovaps 1280(%rdi), %ymm1
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm5
; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm5, %ymm6, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm0 = [1,7,5,u]
; AVX2-FP-NEXT: vpermps %ymm14, %ymm0, %ymm1
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5,3,5,3,5,3,5,3]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm13, %ymm0, %ymm13
; AVX2-FP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm12, %ymm1, %ymm12
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm10, %ymm0, %ymm10
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm8, %ymm1, %ymm8
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7]
; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm4
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm3, %ymm1, %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,3,2,3,5,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm5, %ymm1, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm14 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,2,0,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,0,2,0,4,4,6,4]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vmovaps %ymm13, %ymm5
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups (%rsp), %ymm14 # 32-byte Reload
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,3,3,3,7,7,7,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
; AVX2-FP-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[3,1,3,3,7,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1,3,1,4,5,7,5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vpermps %ymm3, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6]
; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vpermps %ymm14, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 464(%rdi), %xmm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vpermps %ymm12, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 656(%rdi), %xmm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vpermps %ymm12, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 848(%rdi), %xmm0
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vpermps %ymm5, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm15 = ymm9[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 1040(%rdi), %xmm13
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vpermps %ymm14, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm12, %ymm3, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm10 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 1232(%rdi), %xmm9
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vpermps %ymm10, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm8 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm8, %ymm3, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 1424(%rdi), %xmm4
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FP-NEXT: vpermps %ymm5, %ymm6, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7]
; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermilps $85, (%rsp), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm15[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm14, %ymm1, %ymm13
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm12, %ymm0, %ymm12
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4],ymm12[5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm10, %ymm1, %ymm9
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm8
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm5, %ymm1, %ymm1
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpermps %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, (%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r9)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r9)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r9)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r9)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r9)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r9)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r9)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, (%r9)
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax)
; AVX2-FP-NEXT: vmovaps %ymm8, 192(%rax)
; AVX2-FP-NEXT: vmovaps %ymm12, 160(%rax)
; AVX2-FP-NEXT: vmovaps %ymm6, 128(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, (%rax)
; AVX2-FP-NEXT: addq $2568, %rsp # imm = 0xA08
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i32_stride6_vf64:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $2536, %rsp # imm = 0x9E8
; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm5
; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm4
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm7
; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm8
; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm12 = [0,6,4,u]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpermps %ymm15, %ymm12, %ymm0
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm2[0,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2]
; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[0,1],ymm5[0,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm12, %ymm0
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1024(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm12, %ymm0
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1440(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1408(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1376(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1344(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm12, %ymm0
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 1472(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1504(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpermps %ymm13, %ymm12, %ymm0
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm11, %ymm4, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpermps %ymm9, %ymm12, %ymm0
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm8, %ymm4, %ymm2
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 832(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovaps 800(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 768(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpermps %ymm6, %ymm12, %ymm0
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm5, %ymm4, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1152(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpermps %ymm3, %ymm12, %ymm0
; AVX2-FCP-NEXT: vmovaps 1248(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm1[0,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [1,7,5,u]
; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5,3,5,3,5,3,5,3]
; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm13, %ymm0, %ymm13
; AVX2-FCP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm11, %ymm1, %ymm11
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm11[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm9, %ymm0, %ymm9
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm8, %ymm1, %ymm8
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm8[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm6
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm5, %ymm1, %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm0
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm2, %ymm1, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm13 = [2,0,6,4,2,0,6,7]
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0
; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm12 = [2,0,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm15 = [0,0,6,4,0,0,6,4]
; AVX2-FCP-NEXT: # ymm15 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpermps %ymm2, %ymm15, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7]
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm2, %ymm15, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm8 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpermps %ymm8, %ymm15, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm14 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm14, %ymm12, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm11 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpermps %ymm11, %ymm15, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0,
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10903; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10904; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10905; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] 10906; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0 10907; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10908; AVX2-FCP-NEXT: vblendps $243, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload 10909; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 10910; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10911; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 10912; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 10913; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10914; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 10915; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] 10916; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10917; AVX2-FCP-NEXT: vpermps %ymm2, %ymm15, %ymm2 10918; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 10919; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10920; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10921; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10922; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] 10923; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0 10924; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10925; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 10926; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] 10927; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10928; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 10929; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 10930; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10931; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 10932; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] 10933; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10934; AVX2-FCP-NEXT: vpermps %ymm2, %ymm15, %ymm2 10935; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 10936; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10937; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 10938; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload 10939; AVX2-FCP-NEXT: # ymm0 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7] 10940; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0 10941; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10942; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload 10943; AVX2-FCP-NEXT: # ymm9 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] 10944; AVX2-FCP-NEXT: vpermps %ymm9, %ymm12, %ymm2 10945; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 10946; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10947; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 10948; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] 10949; AVX2-FCP-NEXT: vpermps %ymm2, %ymm15, %ymm5 10950; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] 10951; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10952; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 10953; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload 10954; AVX2-FCP-NEXT: # ymm0 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] 10955; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0 10956; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 10957; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload 10958; AVX2-FCP-NEXT: # ymm13 = ymm5[0,1],mem[2,3],ymm5[4,5,6,7] 10959; AVX2-FCP-NEXT: vpermps %ymm13, %ymm12, %ymm5 10960; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm0[3,4,5,6,7] 10961; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10962; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10963; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] 10964; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm12 10965; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5,6,7] 10966; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10967; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm5 = mem[3,3,3,3] 10968; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] 10969; AVX2-FCP-NEXT: vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload 10970; AVX2-FCP-NEXT: # ymm12 = ymm5[0],mem[1,2,3,4],ymm5[5],mem[6,7] 10971; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm5 = [3,1,7,5,0,u,u,u] 10972; AVX2-FCP-NEXT: vpermps %ymm12, %ymm5, %ymm15 10973; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm12 = [0,1,7,5,0,1,7,5] 10974; AVX2-FCP-NEXT: # ymm12 = mem[0,1,0,1] 10975; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload 10976; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4],ymm7[5,6,7] 10977; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10978; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm7 = mem[3,3,3,3] 10979; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] 10980; AVX2-FCP-NEXT: vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload 10981; AVX2-FCP-NEXT: # ymm7 = ymm7[0],mem[1,2,3,4],ymm7[5],mem[6,7] 10982; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm7 10983; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm15 # 32-byte Folded Reload 10984; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm15[5,6,7] 10985; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10986; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm7 = mem[3,3,3,3] 10987; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 10988; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3,4],ymm15[5],ymm7[6,7] 10989; AVX2-FCP-NEXT: vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload 10990; AVX2-FCP-NEXT: # ymm7 = ymm7[0],mem[1,2,3,4],ymm7[5],mem[6,7] 10991; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm7 10992; AVX2-FCP-NEXT: vpermps %ymm8, %ymm12, %ymm8 10993; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm8[5,6,7] 10994; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10995; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm7 = mem[3,3,3,3] 10996; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] 10997; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0],ymm14[1,2,3,4],ymm7[5],ymm14[6,7] 10998; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm7 10999; AVX2-FCP-NEXT: vpermps %ymm11, %ymm12, %ymm6 11000; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm6[5,6,7] 11001; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11002; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm6 = mem[3,3,3,3] 11003; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] 11004; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm13[1,2,3,4],ymm6[5],ymm13[6,7] 11005; AVX2-FCP-NEXT: vpermps %ymm6, %ymm5, %ymm6 11006; AVX2-FCP-NEXT: vpermps %ymm0, %ymm12, %ymm0 11007; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] 11008; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11009; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3] 11010; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] 11011; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1,2,3,4],ymm0[5],ymm9[6,7] 11012; AVX2-FCP-NEXT: vpermps %ymm0, %ymm5, %ymm0 11013; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 11014; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 11015; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11016; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3] 11017; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 11018; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] 11019; AVX2-FCP-NEXT: vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11020; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1,2,3,4],ymm0[5],mem[6,7] 11021; AVX2-FCP-NEXT: vpermps %ymm0, %ymm5, %ymm0 11022; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload 11023; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 11024; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11025; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload 11026; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm2 = mem[3,3,3,3] 11027; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11028; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] 11029; AVX2-FCP-NEXT: vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11030; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1,2,3,4],ymm2[5],mem[6,7] 11031; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 11032; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] 11033; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11034; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload 11035; AVX2-FCP-NEXT: # ymm2 = ymm4[0,1,2,3],mem[4,5,6,7] 11036; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11037; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11038; AVX2-FCP-NEXT: vblendps $15, (%rsp), %ymm0, %ymm5 # 32-byte Folded Reload 11039; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3],ymm0[4,5,6,7] 11040; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11041; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm0 11042; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 11043; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] 11044; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 11045; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 
11046; AVX2-FCP-NEXT: vpermps %ymm5, %ymm4, %ymm2 11047; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 11048; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11049; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11050; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 11051; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11052; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] 11053; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] 11054; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 11055; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 11056; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11057; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11058; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 11059; AVX2-FCP-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] 11060; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11061; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11062; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload 11063; AVX2-FCP-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] 11064; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11065; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm0 11066; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11067; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] 11068; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 11069; AVX2-FCP-NEXT: vpermps %ymm8, %ymm4, %ymm2 11070; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 11071; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11072; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11073; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 11074; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11075; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 11076; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 11077; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11078; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload 11079; AVX2-FCP-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7] 11080; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11081; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11082; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload 11083; AVX2-FCP-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] 11084; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11085; AVX2-FCP-NEXT: vmovaps 464(%rdi), %xmm0 11086; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11087; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] 11088; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 11089; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm2 11090; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 11091; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11092; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11093; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 11094; AVX2-FCP-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11095; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 11096; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 11097; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11098; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11099; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 11100; AVX2-FCP-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] 11101; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11102; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11103; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload 11104; AVX2-FCP-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] 11105; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11106; AVX2-FCP-NEXT: vmovaps 656(%rdi), %xmm0 11107; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11108; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] 11109; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 11110; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm2 11111; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 11112; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11113; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11114; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 11115; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11116; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 11117; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 11118; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11119; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload 11120; AVX2-FCP-NEXT: # ymm2 = ymm10[0,1,2,3],mem[4,5,6,7] 11121; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11122; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11123; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 11124; AVX2-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] 11125; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11126; AVX2-FCP-NEXT: vmovaps 848(%rdi), %xmm0 11127; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11128; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] 11129; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 11130; AVX2-FCP-NEXT: vpermps %ymm6, %ymm4, %ymm2 11131; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 11132; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11133; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11134; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 11135; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11136; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 11137; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 11138; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11139; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload 11140; AVX2-FCP-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] 11141; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11142; AVX2-FCP-NEXT: vblendps $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload 11143; AVX2-FCP-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] 11144; AVX2-FCP-NEXT: vmovaps 1040(%rdi), %xmm13 11145; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] 11146; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 11147; AVX2-FCP-NEXT: vpermps %ymm14, %ymm4, %ymm2 11148; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 11149; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11150; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload 11151; AVX2-FCP-NEXT: # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 11152; AVX2-FCP-NEXT: vpermps %ymm12, %ymm5, %ymm2 11153; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 11154; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11155; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload 11156; AVX2-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] 11157; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11158; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 11159; AVX2-FCP-NEXT: # ymm10 = ymm0[0,1,2,3],mem[4,5,6,7] 11160; AVX2-FCP-NEXT: vmovaps 1232(%rdi), %xmm9 11161; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7] 11162; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 11163; AVX2-FCP-NEXT: vpermps %ymm10, %ymm4, %ymm2 11164; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 11165; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11166; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload 11167; AVX2-FCP-NEXT: # ymm8 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 11168; AVX2-FCP-NEXT: vpermps %ymm8, %ymm5, %ymm2 11169; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 11170; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11171; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11172; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload 11173; AVX2-FCP-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] 11174; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11175; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 11176; AVX2-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] 11177; AVX2-FCP-NEXT: vmovaps 1424(%rdi), %xmm3 11178; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm3[2,3],ymm7[4,5,6,7] 11179; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] 11180; AVX2-FCP-NEXT: vpermps %ymm6, %ymm4, %ymm2 11181; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] 11182; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11183; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11184; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] 11185; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm4 11186; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] 11187; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11188; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 11189; AVX2-FCP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] 11190; AVX2-FCP-NEXT: vblendps $8, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 11191; AVX2-FCP-NEXT: 
# ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] 11192; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload 11193; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3,4,5,6,7] 11194; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] 11195; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] 11196; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 11197; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] 11198; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11199; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 11200; AVX2-FCP-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] 11201; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 11202; AVX2-FCP-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] 11203; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 11204; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] 11205; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 11206; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] 11207; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11208; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 11209; AVX2-FCP-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] 11210; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 11211; AVX2-FCP-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] 11212; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 11213; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] 11214; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 11215; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] 11216; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11217; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 11218; AVX2-FCP-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] 11219; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 11220; AVX2-FCP-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] 11221; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 11222; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] 11223; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 11224; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] 11225; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11226; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 11227; AVX2-FCP-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] 11228; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 11229; AVX2-FCP-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] 11230; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 11231; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] 11232; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 11233; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] 11234; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,1,1,1,5,5,5,5] 11235; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] 11236; AVX2-FCP-NEXT: 
vpermps %ymm14, %ymm1, %ymm13 11237; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3,4,5,6,7] 11238; AVX2-FCP-NEXT: vpermps %ymm12, %ymm0, %ymm12 11239; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4],ymm12[5,6,7] 11240; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm11[1,1,1,1,5,5,5,5] 11241; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3],ymm5[4,5,6,7] 11242; AVX2-FCP-NEXT: vpermps %ymm10, %ymm1, %ymm9 11243; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3,4,5,6,7] 11244; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm8 11245; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7] 11246; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] 11247; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] 11248; AVX2-FCP-NEXT: vpermps %ymm6, %ymm1, %ymm1 11249; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] 11250; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm0 11251; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] 11252; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11253; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rsi) 11254; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11255; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rsi) 11256; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11257; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi) 11258; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11259; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) 11260; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11261; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rsi) 11262; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11263; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rsi) 11264; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11265; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi) 11266; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11267; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi) 11268; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11269; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rdx) 11270; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11271; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rdx) 11272; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11273; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx) 11274; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11275; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx) 11276; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11277; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rdx) 11278; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11279; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rdx) 11280; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11281; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx) 11282; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11283; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx) 11284; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11285; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rcx) 11286; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11287; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rcx) 11288; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11289; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx) 11290; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11291; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx) 11292; 
AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11293; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rcx) 11294; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11295; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rcx) 11296; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11297; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx) 11298; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11299; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx) 11300; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11301; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8) 11302; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11303; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r8) 11304; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11305; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r8) 11306; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11307; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r8) 11308; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11309; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r8) 11310; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11311; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r8) 11312; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11313; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8) 11314; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11315; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8) 11316; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11317; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r9) 11318; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11319; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r9) 11320; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11321; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r9) 11322; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11323; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r9) 11324; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11325; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r9) 11326; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11327; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r9) 11328; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11329; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r9) 11330; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11331; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9) 11332; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 11333; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax) 11334; AVX2-FCP-NEXT: vmovaps %ymm8, 192(%rax) 11335; AVX2-FCP-NEXT: vmovaps %ymm12, 160(%rax) 11336; AVX2-FCP-NEXT: vmovaps %ymm4, 128(%rax) 11337; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11338; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax) 11339; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11340; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax) 11341; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11342; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) 11343; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11344; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) 11345; AVX2-FCP-NEXT: addq $2536, %rsp # imm = 0x9E8 11346; AVX2-FCP-NEXT: vzeroupper 11347; AVX2-FCP-NEXT: retq 11348; 11349; AVX512-LABEL: load_i32_stride6_vf64: 11350; AVX512: # %bb.0: 11351; AVX512-NEXT: subq $2632, %rsp # imm = 0xA48 11352; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm21 11353; AVX512-NEXT: vmovdqa64 1408(%rdi), 
%zmm1 11354; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm20 11355; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm0 11356; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm2 11357; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm18 11358; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm27 11359; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm3 11360; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm25 11361; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm4 11362; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] 11363; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 11364; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] 11365; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 11366; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 11367; AVX512-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 11368; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11369; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 11370; AVX512-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 11371; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11372; AVX512-NEXT: vmovdqa64 %zmm18, %zmm7 11373; AVX512-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 11374; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11375; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 11376; AVX512-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 11377; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11378; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 11379; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11380; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] 11381; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 11382; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 11383; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 11384; AVX512-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 11385; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11386; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] 11387; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 11388; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 11389; AVX512-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 11390; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11391; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 11392; AVX512-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 11393; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11394; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 11395; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11396; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] 11397; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 11398; AVX512-NEXT: vmovdqa64 %zmm20, %zmm7 11399; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 11400; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11401; AVX512-NEXT: vmovdqa64 %zmm27, %zmm7 11402; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 11403; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11404; AVX512-NEXT: vmovdqa64 %zmm25, %zmm7 11405; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 11406; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11407; AVX512-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 11408; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11409; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] 11410; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 11411; AVX512-NEXT: vmovdqa64 %zmm20, %zmm7 11412; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 11413; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11414; 
AVX512-NEXT: vmovdqa64 %zmm27, %zmm7 11415; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 11416; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11417; AVX512-NEXT: vmovdqa64 %zmm25, %zmm7 11418; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 11419; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11420; AVX512-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 11421; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11422; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] 11423; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 11424; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 11425; AVX512-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 11426; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11427; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] 11428; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 11429; AVX512-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 11430; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11431; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 11432; AVX512-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 11433; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11434; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 11435; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11436; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 11437; AVX512-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 11438; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11439; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 11440; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11441; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 11442; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11443; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 11444; AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 11445; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11446; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 11447; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11448; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] 11449; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 11450; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 11451; AVX512-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 11452; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11453; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] 11454; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 11455; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 11456; AVX512-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 11457; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11458; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] 11459; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 11460; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 11461; AVX512-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 11462; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11463; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] 11464; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 11465; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 11466; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11467; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm26 11468; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm1 11469; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 11470; AVX512-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 11471; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill 11472; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 11473; AVX512-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 11474; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11475; AVX512-NEXT: vmovdqa64 %zmm26, %zmm2 11476; AVX512-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 11477; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11478; AVX512-NEXT: vmovdqa64 %zmm26, %zmm2 11479; AVX512-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 11480; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11481; AVX512-NEXT: vmovdqa64 %zmm26, %zmm2 11482; AVX512-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 11483; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11484; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 11485; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm22 11486; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm1 11487; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 11488; AVX512-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 11489; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11490; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm19 11491; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm2 11492; AVX512-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 11493; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11494; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 11495; AVX512-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 11496; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11497; AVX512-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 11498; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11499; AVX512-NEXT: vmovdqa64 %zmm22, %zmm28 11500; AVX512-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 11501; AVX512-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 11502; AVX512-NEXT: vmovdqa64 %zmm22, %zmm29 11503; AVX512-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 11504; AVX512-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 11505; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 11506; AVX512-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 11507; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11508; AVX512-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 11509; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11510; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 11511; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 11512; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 11513; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0 11514; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] 11515; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 11516; AVX512-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 11517; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] 11518; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 11519; AVX512-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 11520; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] 11521; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 11522; AVX512-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 11523; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11524; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] 11525; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 11526; AVX512-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 11527; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill 11528; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] 11529; AVX512-NEXT: vmovdqa64 %zmm2, %zmm23 11530; AVX512-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 11531; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] 11532; AVX512-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 11533; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 11534; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm6 11535; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 11536; AVX512-NEXT: vpermt2d %zmm6, 
%zmm3, %zmm15 11537; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 11538; AVX512-NEXT: vpermt2d %zmm6, %zmm4, %zmm16 11539; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 11540; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm27 11541; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 11542; AVX512-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 11543; AVX512-NEXT: vmovdqa64 %zmm6, %zmm17 11544; AVX512-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 11545; AVX512-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 11546; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm10 11547; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm7 11548; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 11549; AVX512-NEXT: vpermt2d %zmm10, %zmm3, %zmm8 11550; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm1 11551; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm0 11552; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 11553; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 11554; AVX512-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 11555; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 11556; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 11557; AVX512-NEXT: vpermt2d %zmm10, %zmm12, %zmm13 11558; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 11559; AVX512-NEXT: vmovdqa64 %zmm7, %zmm18 11560; AVX512-NEXT: vpermt2d %zmm10, %zmm14, %zmm18 11561; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 11562; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 11563; AVX512-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 11564; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 11565; AVX512-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 11566; AVX512-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 11567; AVX512-NEXT: movb $56, %al 11568; AVX512-NEXT: kmovw %eax, %k2 11569; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11570; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} 11571; AVX512-NEXT: movw $-2048, %ax # imm = 0xF800 11572; AVX512-NEXT: kmovw %eax, %k1 11573; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11574; AVX512-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} 11575; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11576; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} 11577; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11578; AVX512-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} 11579; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11580; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} 11581; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11582; AVX512-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} 11583; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11584; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} 11585; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11586; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 11587; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11588; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} 11589; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11590; AVX512-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} 11591; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11592; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} 11593; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11594; AVX512-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} 11595; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11596; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} 11597; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11598; AVX512-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} 11599; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11600; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} 11601; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11602; AVX512-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} 11603; AVX512-NEXT: movw $31, %ax 11604; AVX512-NEXT: kmovw %eax, %k2 11605; AVX512-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2} 11606; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11607; AVX512-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} 11608; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 11609; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11610; AVX512-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} 11611; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11612; AVX512-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 11613; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 11614; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 11615; AVX512-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} 11616; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11617; AVX512-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 11618; AVX512-NEXT: vmovdqa64 %zmm7, %zmm24 11619; AVX512-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} 11620; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11621; AVX512-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} 11622; AVX512-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2} 11623; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11624; AVX512-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} 11625; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 11626; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload 11627; AVX512-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} 11628; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11629; AVX512-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 11630; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 11631; AVX512-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} 11632; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11633; AVX512-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} 11634; AVX512-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} 11635; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11636; AVX512-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} 11637; AVX512-NEXT: movw $992, %ax # imm = 0x3E0 11638; AVX512-NEXT: kmovw %eax, %k1 11639; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11640; AVX512-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} 11641; AVX512-NEXT: movb $-32, %al 11642; AVX512-NEXT: kmovw %eax, %k2 11643; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11644; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} 11645; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11646; AVX512-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} 11647; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11648; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} 11649; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11650; AVX512-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} 11651; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11652; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} 11653; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11654; AVX512-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} 11655; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11656; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} 11657; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11658; AVX512-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 11659; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
; AVX512-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1}
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2}
; AVX512-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1}
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1}
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2}
; AVX512-NEXT: vmovdqa64 %zmm3, 192(%rsi)
; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rsi)
; AVX512-NEXT: vmovdqa64 %zmm20, 64(%rsi)
; AVX512-NEXT: vmovdqa64 %zmm15, (%rsi)
; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rdx)
; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rdx)
; AVX512-NEXT: vmovdqa64 %zmm31, 192(%rcx)
; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx)
; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rcx)
; AVX512-NEXT: vmovdqa64 %zmm28, 128(%rcx)
; AVX512-NEXT: vmovdqa64 %zmm25, 192(%r8)
; AVX512-NEXT: vmovdqa64 %zmm12, (%r8)
; AVX512-NEXT: vmovdqa64 %zmm7, 64(%r8)
; AVX512-NEXT: vmovdqa64 %zmm29, 128(%r8)
; AVX512-NEXT: vmovdqa64 %zmm5, 192(%r9)
; AVX512-NEXT: vmovdqa64 %zmm17, (%r9)
; AVX512-NEXT: vmovdqa64 %zmm23, 64(%r9)
; AVX512-NEXT: vmovdqa64 %zmm11, 128(%r9)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rax)
; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax)
; AVX512-NEXT: vmovdqa64 %zmm6, (%rax)
; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512-NEXT: addq $2632, %rsp # imm = 0xA48
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i32_stride6_vf64:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: subq $2632, %rsp # imm = 0xA48
; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm21
; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm20
; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2
; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm18
; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm27
; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3
; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25
; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7
; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7
; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm7
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7
; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm8
; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm7
; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm7
; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm7
; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm7
; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm7
; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm7
; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3
; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm25, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26
; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm5, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm8, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm26
; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm22
; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm19
; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2
; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm5
; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm8
; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm28
; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm28
; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm31
; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm29
; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm29
; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm25
; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm19
; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22
; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm20
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm21
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0]
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm12, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0]
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0]
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm23
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm23
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0]
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm2
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15
; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm3, %zmm15
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm16
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27
; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm27
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30
; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm30
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm17
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm17
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10
; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm7
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8
; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm3, %zmm8
; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm0
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm4, %zmm9
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm12, %zmm13
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm18
; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm14, %zmm18
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14
; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11
; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm5, %zmm11
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm5
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1
; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm24, %zmm10
; AVX512-FCP-NEXT: movb $56, %al
; AVX512-FCP-NEXT: kmovw %eax, %k2
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800
; AVX512-FCP-NEXT: kmovw %eax, %k1
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1}
; AVX512-FCP-NEXT: movw $31, %ax
; AVX512-FCP-NEXT: kmovw %eax, %k2
; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm24
; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1}
; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1}
; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1}
; AVX512-FCP-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512-FCP-NEXT: kmovw %eax, %k1
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1}
; AVX512-FCP-NEXT: movb $-32, %al
; AVX512-FCP-NEXT: kmovw %eax, %k2
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2}
; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2}
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 128(%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 192(%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 128(%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 192(%r8)
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%r8)
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%r8)
; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 128(%r8)
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%r9)
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%r9)
; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9)
; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%r9)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512-FCP-NEXT: addq $2632, %rsp # imm = 0xA48
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i32_stride6_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: subq $2632, %rsp # imm = 0xA48
; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm21
; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm1
; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm20
; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm0
; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm2
; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm18
; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm27
; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm3
; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm25
; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm4
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7
; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm6, %zmm7
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7
; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm6, %zmm7
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm7
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm5, %zmm7
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7
; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm6, %zmm7
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm6
; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6
; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm7, %zmm8
; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm7, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0
; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm7, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm7
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm7
; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm7
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm7
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm7
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm7
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm7
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm21, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm7
; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm7
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm7
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm7
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm7
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm7
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm21, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm7, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm0, %zmm3
; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3
; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm7, %zmm3
; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm4
; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm7, %zmm3
; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm7
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm1
; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm1
; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm6
; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm31, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm25, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm3, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm26
; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm1
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm5, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm8, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm31, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm25, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm26
; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm22
; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm1
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm5, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm19
; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm2
; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm2, %zmm5
; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4
; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm8, %zmm4
; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm2, %zmm8
; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm28
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm31, %zmm28
; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm19, %zmm31
; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm29
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm25, %zmm29
; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm19, %zmm25
; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm3, %zmm4
; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm19, %zmm3
; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm19
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm22
; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm2
; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm3, %zmm20
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm4, %zmm21
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0]
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm12, %zmm1
; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0]
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm14, %zmm1
; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0]
; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm23
; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm5, %zmm23
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0]
; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm24, %zmm2
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm6
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15
; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm3, %zmm15
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16
; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm4, %zmm16
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27
; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm12, %zmm27
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30
; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm14, %zmm30
; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm17
; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm5, %zmm17
; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm24, %zmm6
; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm10
; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm7
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8
; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm3, %zmm8
; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm1
; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm0
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm4, %zmm9
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm12, %zmm13
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm18
; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm14, %zmm18
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm14
; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11
; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm5, %zmm11
; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm5
; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm24, %zmm1
; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm24, %zmm10
; AVX512DQ-NEXT: movb $56, %al
; AVX512DQ-NEXT: kmovw %eax, %k2
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
; AVX512DQ-NEXT: movw $-2048, %ax # imm = 0xF800
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1}
; AVX512DQ-NEXT: movw $31, %ax
; AVX512DQ-NEXT: kmovw %eax, %k2
; AVX512DQ-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm24
; AVX512DQ-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1}
; AVX512DQ-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1}
; AVX512DQ-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1}
; AVX512DQ-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1}
; AVX512DQ-NEXT: movb $-32, %al
; AVX512DQ-NEXT: kmovw %eax, %k2
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
; AVX512DQ-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2}
; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2}
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rsi)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rsi)
; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%rsi)
; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rsi)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm31, 192(%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm28, 128(%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm25, 192(%r8)
; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%r8)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%r8)
; AVX512DQ-NEXT: vmovdqa64 %zmm29, 128(%r8)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%r9)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512DQ-NEXT: addq $2632, %rsp # imm = 0xA48
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride6_vf64:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: subq $2632, %rsp # imm = 0xA48
; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm21
; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm20
; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm18
; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm27
; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm7
; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm25, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm5, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm8, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm26
; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm22
; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm19
; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2
; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm28
; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm28
; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm31
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm29
; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm29
; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm25
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm19
; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22
; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm20
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21
; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm21
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm12, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm23
; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm23
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15
; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm3, %zmm15
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm16
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27
; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm27
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30
; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm30
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm17
; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm17
; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10
; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm7
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8
; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm3, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm0
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm4, %zmm9
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm12, %zmm13
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm18
; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm14, %zmm18
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11
; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm5, %zmm11
; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1
; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm24, %zmm10
; AVX512DQ-FCP-NEXT: movb $56, %al
; AVX512DQ-FCP-NEXT: kmovw %eax, %k2
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
; AVX512DQ-FCP-NEXT: movw $-2048, %ax # imm = 0xF800
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1}
; AVX512DQ-FCP-NEXT: movw $31, %ax
; AVX512DQ-FCP-NEXT: kmovw %eax, %k2
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm24
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1}
; AVX512DQ-FCP-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1}
; AVX512DQ-FCP-NEXT: movb $-32, %al
; AVX512DQ-FCP-NEXT: kmovw %eax, %k2
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 128(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 192(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 128(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 192(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 128(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%r9)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%r9)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512DQ-FCP-NEXT: addq $2632, %rsp # imm = 0xA48
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i32_stride6_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48
; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm21
; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm20
; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm18
; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm27
; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm3
; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm25
; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7
; AVX512BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7
; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7
; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm6
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6
; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm8
; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0
; AVX512BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7
; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7
; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7
; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7
; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7
; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7
; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3
; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3
; AVX512BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm3
; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm4
; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm3
; AVX512BW-NEXT: vmovdqu64 %zmm3,
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12839; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 12840; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12841; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 12842; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12843; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 12844; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 12845; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12846; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 12847; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12848; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] 12849; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 12850; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 12851; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 12852; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12853; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] 12854; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 12855; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 12856; AVX512BW-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 12857; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12858; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] 12859; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 12860; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 12861; AVX512BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 12862; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12863; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] 12864; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 12865; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 12866; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12867; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm26 12868; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 12869; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 12870; AVX512BW-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 12871; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12872; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 12873; AVX512BW-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 12874; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12875; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 12876; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 12877; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12878; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 12879; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 12880; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12881; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 12882; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 12883; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12884; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 12885; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm22 12886; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 12887; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 12888; AVX512BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 12889; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12890; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm19 12891; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 12892; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 12893; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12894; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 12895; AVX512BW-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 12896; AVX512BW-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12897; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 12898; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12899; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 12900; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 12901; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 12902; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 12903; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 12904; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 12905; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 12906; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 12907; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12908; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 12909; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12910; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 12911; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 12912; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 12913; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 12914; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] 12915; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 12916; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 12917; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] 12918; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 12919; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 12920; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] 12921; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 12922; AVX512BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 12923; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12924; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] 12925; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 12926; AVX512BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 12927; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill 12928; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] 12929; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 12930; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 12931; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] 12932; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 12933; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 12934; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 12935; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 12936; AVX512BW-NEXT: vpermt2d %zmm6, %zmm3, %zmm15 12937; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 12938; AVX512BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm16 12939; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 12940; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm27 12941; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 12942; AVX512BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 12943; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm17 12944; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 12945; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 12946; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 12947; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm7 12948; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 12949; AVX512BW-NEXT: vpermt2d %zmm10, %zmm3, %zmm8 12950; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 12951; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm0 12952; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 12953; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 12954; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 12955; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 12956; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 12957; AVX512BW-NEXT: vpermt2d %zmm10, %zmm12, %zmm13 12958; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 12959; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 12960; AVX512BW-NEXT: vpermt2d %zmm10, %zmm14, %zmm18 12961; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 12962; AVX512BW-NEXT: 
vmovdqa64 %zmm10, %zmm11 12963; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 12964; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 12965; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 12966; AVX512BW-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 12967; AVX512BW-NEXT: movb $56, %al 12968; AVX512BW-NEXT: kmovd %eax, %k2 12969; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12970; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} 12971; AVX512BW-NEXT: movw $-2048, %ax # imm = 0xF800 12972; AVX512BW-NEXT: kmovd %eax, %k1 12973; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12974; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} 12975; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12976; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} 12977; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12978; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} 12979; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12980; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} 12981; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12982; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} 12983; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12984; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} 12985; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12986; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 12987; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12988; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} 12989; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12990; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} 12991; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12992; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} 12993; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12994; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} 12995; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12996; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} 12997; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12998; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} 12999; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13000; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} 13001; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13002; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} 13003; AVX512BW-NEXT: movw $31, %ax 13004; AVX512BW-NEXT: kmovd %eax, %k2 13005; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2} 13006; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13007; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} 13008; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 13009; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13010; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} 13011; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13012; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 13013; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 13014; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 13015; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} 13016; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13017; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 13018; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 13019; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} 13020; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13021; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} 13022; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2} 13023; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13024; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} 13025; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 13026; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload 13027; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} 13028; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13029; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 13030; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 13031; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} 13032; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13033; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} 13034; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} 13035; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13036; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} 13037; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 13038; AVX512BW-NEXT: kmovd %eax, %k1 13039; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13040; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} 13041; AVX512BW-NEXT: movb $-32, %al 13042; AVX512BW-NEXT: kmovd %eax, %k2 13043; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13044; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} 13045; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13046; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} 13047; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13048; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} 13049; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13050; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} 13051; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13052; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} 13053; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13054; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} 13055; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13056; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} 13057; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13058; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 13059; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13060; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} 13061; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1} 13062; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13063; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} 13064; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} 13065; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13066; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} 13067; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} 13068; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13069; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} 13070; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rsi) 13071; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) 13072; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rsi) 13073; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rsi) 13074; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) 13075; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx) 13076; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) 13077; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rdx) 13078; AVX512BW-NEXT: 
vmovdqa64 %zmm31, 192(%rcx) 13079; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) 13080; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) 13081; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rcx) 13082; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%r8) 13083; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r8) 13084; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%r8) 13085; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%r8) 13086; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%r9) 13087; AVX512BW-NEXT: vmovdqa64 %zmm17, (%r9) 13088; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r9) 13089; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%r9) 13090; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 13091; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) 13092; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) 13093; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) 13094; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) 13095; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 13096; AVX512BW-NEXT: vzeroupper 13097; AVX512BW-NEXT: retq 13098; 13099; AVX512BW-FCP-LABEL: load_i32_stride6_vf64: 13100; AVX512BW-FCP: # %bb.0: 13101; AVX512BW-FCP-NEXT: subq $2632, %rsp # imm = 0xA48 13102; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm21 13103; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm1 13104; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm20 13105; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0 13106; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 13107; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm18 13108; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm27 13109; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3 13110; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 13111; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 13112; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] 13113; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 13114; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] 13115; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 13116; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 13117; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 13118; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13119; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 13120; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 13121; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13122; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 13123; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 13124; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13125; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 13126; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 13127; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13128; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 13129; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13130; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] 13131; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 13132; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 13133; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 13134; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 13135; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13136; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] 13137; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 13138; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 13139; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 13140; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13141; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 13142; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 13143; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13144; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 13145; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13146; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] 13147; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 13148; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 13149; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 13150; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13151; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 13152; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 13153; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13154; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 13155; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 13156; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13157; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 13158; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13159; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] 13160; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 13161; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 13162; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 13163; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13164; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 13165; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 13166; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13167; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 13168; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 13169; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13170; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 13171; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13172; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] 13173; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 13174; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 13175; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 13176; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13177; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] 13178; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 13179; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 13180; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13181; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 13182; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 13183; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13184; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 13185; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13186; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 13187; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 13188; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13189; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 13190; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13191; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 13192; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13193; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 13194; 
AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 13195; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13196; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 13197; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13198; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] 13199; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 13200; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 13201; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 13202; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13203; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] 13204; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 13205; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 13206; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 13207; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13208; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] 13209; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 13210; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 13211; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 13212; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13213; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] 13214; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 13215; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 13216; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13217; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26 13218; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 13219; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 13220; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 13221; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13222; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 13223; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 13224; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13225; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 13226; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 13227; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13228; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 13229; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 13230; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13231; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 13232; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 13233; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13234; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 13235; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm22 13236; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1 13237; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 13238; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 13239; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13240; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm19 13241; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 13242; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 13243; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13244; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 13245; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 13246; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13247; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 13248; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13249; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 13250; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 13251; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 13252; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 13253; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 13254; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 13255; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 13256; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 13257; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13258; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 13259; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13260; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 13261; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 13262; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 13263; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 13264; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] 13265; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 13266; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 13267; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] 13268; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 13269; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 13270; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] 13271; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 13272; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 13273; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13274; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] 13275; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 13276; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 13277; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill 13278; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] 13279; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 13280; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 13281; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] 13282; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 13283; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 13284; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 13285; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 13286; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm3, %zmm15 13287; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 13288; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm16 13289; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 13290; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm27 13291; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 13292; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 13293; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm17 13294; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 13295; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 13296; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 13297; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm7 13298; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 13299; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm3, %zmm8 13300; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1 13301; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm0 13302; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 13303; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 13304; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 13305; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 13306; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 13307; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm12, %zmm13 13308; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 13309; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 13310; 
AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm14, %zmm18 13311; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 13312; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 13313; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 13314; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 13315; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 13316; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 13317; AVX512BW-FCP-NEXT: movb $56, %al 13318; AVX512BW-FCP-NEXT: kmovd %eax, %k2 13319; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13320; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} 13321; AVX512BW-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 13322; AVX512BW-FCP-NEXT: kmovd %eax, %k1 13323; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13324; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} 13325; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13326; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} 13327; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13328; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} 13329; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13330; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} 13331; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13332; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} 13333; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13334; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} 13335; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13336; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 13337; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13338; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} 13339; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13340; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} 13341; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13342; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} 13343; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13344; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} 13345; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13346; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} 13347; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13348; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} 13349; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13350; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} 13351; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13352; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} 13353; AVX512BW-FCP-NEXT: movw $31, %ax 13354; AVX512BW-FCP-NEXT: kmovd %eax, %k2 13355; AVX512BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2} 13356; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13357; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} 13358; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 13359; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13360; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} 13361; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13362; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 13363; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 13364; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 13365; AVX512BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} 13366; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13367; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 13368; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm24 13369; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} 13370; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13371; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} 13372; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2} 13373; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13374; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} 13375; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 13376; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload 13377; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} 13378; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13379; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 13380; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 13381; AVX512BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} 13382; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13383; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} 13384; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} 13385; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13386; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} 13387; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 13388; AVX512BW-FCP-NEXT: kmovd %eax, %k1 13389; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13390; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} 13391; AVX512BW-FCP-NEXT: movb $-32, %al 13392; AVX512BW-FCP-NEXT: kmovd %eax, %k2 13393; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13394; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} 13395; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13396; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} 13397; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13398; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} 13399; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13400; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} 13401; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13402; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} 13403; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13404; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} 13405; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13406; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} 13407; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13408; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 13409; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13410; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} 13411; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1} 13412; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13413; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} 13414; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} 13415; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13416; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} 13417; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} 
13418; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13419; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} 13420; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rsi) 13421; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) 13422; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rsi) 13423; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rsi) 13424; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx) 13425; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rdx) 13426; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) 13427; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rdx) 13428; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%rcx) 13429; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) 13430; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) 13431; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 128(%rcx) 13432; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 192(%r8) 13433; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r8) 13434; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%r8) 13435; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 128(%r8) 13436; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%r9) 13437; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%r9) 13438; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9) 13439; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%r9) 13440; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 13441; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) 13442; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) 13443; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) 13444; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) 13445; AVX512BW-FCP-NEXT: addq $2632, %rsp # imm = 0xA48 13446; AVX512BW-FCP-NEXT: vzeroupper 13447; AVX512BW-FCP-NEXT: retq 13448; 13449; AVX512DQ-BW-LABEL: load_i32_stride6_vf64: 13450; AVX512DQ-BW: # %bb.0: 13451; AVX512DQ-BW-NEXT: subq $2632, %rsp # imm = 0xA48 13452; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm21 13453; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm1 13454; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm20 13455; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 13456; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm2 13457; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm18 13458; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm27 13459; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm3 13460; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm25 13461; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm4 13462; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] 13463; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 13464; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] 13465; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 13466; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 13467; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 13468; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13469; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 13470; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 13471; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13472; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm7 13473; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 13474; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13475; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 13476; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 13477; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13478; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 13479; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13480; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] 13481; AVX512DQ-BW-NEXT: # 
zmm7 = mem[0,1,2,3,0,1,2,3] 13482; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 13483; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 13484; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 13485; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13486; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] 13487; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 13488; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 13489; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 13490; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13491; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 13492; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 13493; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13494; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 13495; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13496; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] 13497; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 13498; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm7 13499; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 13500; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13501; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm7 13502; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 13503; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13504; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm7 13505; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 13506; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13507; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 13508; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13509; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] 13510; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 13511; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm7 13512; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 13513; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13514; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm7 13515; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 13516; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13517; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm7 13518; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 13519; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13520; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 13521; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13522; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] 13523; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 13524; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 13525; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 13526; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13527; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] 13528; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 13529; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 13530; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13531; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 13532; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 13533; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13534; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 13535; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13536; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 13537; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 13538; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13539; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 13540; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13541; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 13542; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13543; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 13544; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 13545; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13546; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 13547; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13548; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] 13549; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 13550; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 13551; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 13552; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13553; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] 13554; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 13555; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 13556; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 13557; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13558; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] 13559; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 13560; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 13561; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 13562; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13563; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] 13564; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 13565; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 13566; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13567; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm26 13568; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm1 13569; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 13570; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 13571; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13572; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 13573; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 13574; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13575; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 13576; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 13577; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13578; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 13579; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 13580; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13581; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 13582; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 13583; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13584; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 13585; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm22 13586; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm1 13587; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 13588; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 13589; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13590; AVX512DQ-BW-NEXT: 
vmovdqa64 1280(%rdi), %zmm19
; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm2
; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm5
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4
; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm8, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm8
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm28
; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm28
; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm31
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm29
; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm29
; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm25
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4
; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19
; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm22
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm0
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm20
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21
; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm21
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm23
; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm23
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0]
; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15
; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm3, %zmm15
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16
; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm16
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27
; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm27
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30
; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm30
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm17
; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm17
; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm3, %zmm8
; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm0
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm9
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm12, %zmm13
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm18
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm14, %zmm18
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11
; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm11
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm5
; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm1
; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm24, %zmm10
; AVX512DQ-BW-NEXT: movb $56, %al
; AVX512DQ-BW-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
; AVX512DQ-BW-NEXT: movw $-2048, %ax # imm = 0xF800
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1}
; AVX512DQ-BW-NEXT: movw $31, %ax
; AVX512DQ-BW-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm24
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1}
; AVX512DQ-BW-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1}
; AVX512DQ-BW-NEXT: movb $-32, %al
; AVX512DQ-BW-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1}
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 128(%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 128(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 192(%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 128(%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 192(%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 128(%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%r9)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%r9)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 64(%r9)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%r9)
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 128(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512DQ-BW-NEXT: addq $2632, %rsp # imm = 0xA48
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: subq $2632, %rsp # imm = 0xA48
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm21
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm20
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm27
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26]
; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27]
; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12]
; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13]
; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm25, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14]
; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15]
; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm5, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm8, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm26
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm22
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm19
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm28
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm28
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm31
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm29
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm29
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm25
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm12, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm3, %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm16
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm27
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm30
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm3, %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm4, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm12, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm14, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm5, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm24, %zmm10
; AVX512DQ-BW-FCP-NEXT: movb $56, %al
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
; AVX512DQ-BW-FCP-NEXT: movw $-2048, %ax # imm = 0xF800
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1}
; AVX512DQ-BW-FCP-NEXT: movw $31, %ax
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm24
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1}
; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1}
; AVX512DQ-BW-FCP-NEXT: movb $-32, %al
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 128(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 192(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 128(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%r9)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: addq $2632, %rsp # imm = 0xA48
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <384 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <384 x i32> %wide.vec, <384 x i32> poison, <64 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186, i32 192, i32 198, i32 204, i32 210, i32 216, i32 222, i32 228, i32 234, i32 240, i32 246, i32 252, i32 258, i32 264, i32 270, i32 276, i32 282, i32 288, i32 294, i32 300, i32 306, i32 312, i32 318, i32 324, i32 330, i32 336, i32 342, i32 348, i32 354, i32 360, i32 366, i32 372, i32 378>
  %strided.vec1 = shufflevector <384 x i32> %wide.vec, <384 x i32> poison, <64 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187, i32 193, i32 199, i32 205, i32 211, i32 217, i32 223, i32 229, i32 235, i32 241, i32 247, i32 253, i32 259, i32 265, i32 271, i32 277, i32 283, i32 289, i32 295, i32 301, i32 307, i32 313, i32 319, i32 325, i32 331, i32 337, i32 343, i32 349, i32 355, i32 361, i32 367, i32 373, i32 379>
  %strided.vec2 = shufflevector <384 x i32> %wide.vec, <384 x i32> poison, <64 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188, i32 194, i32 200, i32 206, i32 212, i32 218, i32 224, i32 230, i32 236, i32 242, i32 248, i32 254, i32 260, i32 266, i32 272, i32 278, i32 284, i32 290, i32 296, i32 302, i32 308, i32 314, i32 320, i32 326, i32 332, i32 338, i32 344, i32 350, i32 356, i32 362, i32 368, i32 374, i32 380>
  %strided.vec3 = shufflevector <384 x i32> %wide.vec, <384 x i32> poison, <64 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189, i32 195, i32 201, i32 207, i32 213, i32 219, i32 225, i32 231, i32 237, i32 243, i32 249, i32 255, i32 261, i32 267, i32 273, i32 279, i32 285, i32 291, i32 297, i32 303, i32 309, i32 315, i32 321, i32 327, i32 333, i32 339, i32 345, i32 351, i32 357, i32 363, i32 369, i32 375, i32 381>
  %strided.vec4 = shufflevector <384 x i32> %wide.vec, <384 x i32> poison, <64 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190, i32 196, i32 202, i32 208, i32 214, i32 220, i32 226, i32 232, i32 238, i32 244, i32 250, i32 256, i32 262, i32 268, i32 274, i32 280, i32 286, i32 292, i32 298, i32 304, i32 310, i32 316, i32 322, i32 328, i32 334, i32 340, i32 346, i32 352, i32 358, i32 364, i32 370, i32 376, i32 382>
  %strided.vec5 = shufflevector <384 x i32> %wide.vec, <384 x i32> poison, <64 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191, i32 197, i32 203, i32 209, i32 215, i32 221, i32 227, i32 233, i32 239, i32 245, i32 251, i32 257, i32 263, i32 269, i32 275, i32 281, i32 287, i32 293, i32 299, i32 305, i32 311, i32 317, i32 323, i32 329, i32 335, i32 341, i32 347, i32 353, i32 359, i32 365, i32 371, i32 377, i32 383>
  store <64 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <64 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <64 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <64 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <64 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <64 x i32> %strided.vec5, ptr %out.vec5, align 64
  ret void
}