; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved stores.
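;
; As a rough sketch (hypothetical source; the names a..f, out, and n are
; made up and not part of the checked functions below), a stride-6
; interleaved store is what the LoopVectorizer produces for a scalar loop
; of the form:
;
;   for (int i = 0; i < n; ++i) {
;     out[6*i + 0] = a[i]; out[6*i + 1] = b[i]; out[6*i + 2] = c[i];
;     out[6*i + 3] = d[i]; out[6*i + 4] = e[i]; out[6*i + 5] = f[i];
;   }
;
; The six input vectors are concatenated and then shuffled so that element i
; of stream k lands at output position 6*i + k, as in the IR bodies below.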

define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride6_vf2:
; SSE:         # %bb.0:
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0]
; SSE-NEXT:    movaps %xmm2, %xmm5
; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0]
; SSE-NEXT:    movsd {{.*#+}} xmm6 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm7 = mem[0],zero
; SSE-NEXT:    movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0]
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,3],xmm7[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2],xmm4[1,3]
; SSE-NEXT:    movaps %xmm5, 32(%rax)
; SSE-NEXT:    movaps %xmm7, 16(%rax)
; SSE-NEXT:    movaps %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: store_i32_stride6_vf2:
; AVX:         # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX-NEXT:    vpermilps {{.*#+}} ymm2 = ymm2[u,u,0,2,u,u,5,7]
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[1,3],xmm4[1,3]
; AVX-NEXT:    vmovaps %xmm1, 32(%rax)
; AVX-NEXT:    vmovaps %ymm0, (%rax)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_i32_stride6_vf2:
; AVX2:         # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; AVX2-NEXT:    vmovaps %xmm1, 32(%rax)
; AVX2-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: store_i32_stride6_vf2:
; AVX2-FP:         # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX2-FP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; AVX2-FP-NEXT:    vmovaps %xmm1, 32(%rax)
; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: store_i32_stride6_vf2:
; AVX2-FCP:         # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-FCP-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm4 = [0,2,4,6,u,u,1,3]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm4, %ymm0
; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-FCP-NEXT:    vmovaps %xmm3, 32(%rax)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: store_i32_stride6_vf2:
; AVX512:         # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512-NEXT:    vmovaps %ymm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: store_i32_stride6_vf2:
; AVX512-FCP:         # %bb.0:
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512-FCP-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512-FCP-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512-FCP-NEXT:    vmovaps %ymm0, (%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: store_i32_stride6_vf2:
; AVX512DQ:         # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512DQ-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-NEXT:    vmovaps %ymm0, (%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: store_i32_stride6_vf2:
; AVX512DQ-FCP:         # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-FCP-NEXT:    vmovaps %ymm0, (%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: store_i32_stride6_vf2:
; AVX512BW:         # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512BW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512BW-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512BW-NEXT:    vmovaps %ymm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: store_i32_stride6_vf2:
; AVX512BW-FCP:         # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512BW-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512BW-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512BW-FCP-NEXT:    vmovaps %ymm0, (%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: store_i32_stride6_vf2:
; AVX512DQ-BW:         # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-BW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512DQ-BW-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-BW-NEXT:    vmovaps %ymm0, (%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf2:
; AVX512DQ-BW-FCP:         # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-BW-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovaps %ymm0, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64
  %in.vec1 = load <2 x i32>, ptr %in.vecptr1, align 64
  %in.vec2 = load <2 x i32>, ptr %in.vecptr2, align 64
  %in.vec3 = load <2 x i32>, ptr %in.vecptr3, align 64
  %in.vec4 = load <2 x i32>, ptr %in.vecptr4, align 64
  %in.vec5 = load <2 x i32>, ptr %in.vecptr5, align 64
  %1 = shufflevector <2 x i32> %in.vec0, <2 x i32> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <2 x i32> %in.vec2, <2 x i32> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = shufflevector <2 x i32> %in.vec4, <2 x i32> %in.vec5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = shufflevector <4 x i32> %3, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x i32> %4, <8 x i32> %5, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %interleaved.vec = shufflevector <12 x i32> %6, <12 x i32> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
  store <12 x i32> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride6_vf4:
; SSE:         # %bb.0:
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps (%rsi), %xmm4
; SSE-NEXT:    movaps (%rdx), %xmm1
; SSE-NEXT:    movaps (%rcx), %xmm5
; SSE-NEXT:    movaps (%r8), %xmm7
; SSE-NEXT:    movaps (%r9), %xmm3
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT:    movaps %xmm7, %xmm6
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
; SSE-NEXT:    movaps %xmm7, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3]
; SSE-NEXT:    movaps %xmm7, %xmm9
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,1],xmm3[1,1]
; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE-NEXT:    movaps %xmm0, %xmm7
; SSE-NEXT:    unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm7[2,3]
; SSE-NEXT:    movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0]
; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[2,3]
; SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm8[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[0,2]
; SSE-NEXT:    movaps %xmm3, 16(%rax)
; SSE-NEXT:    movaps %xmm2, 32(%rax)
; SSE-NEXT:    movaps %xmm0, 48(%rax)
; SSE-NEXT:    movaps %xmm1, 80(%rax)
; SSE-NEXT:    movaps %xmm6, 64(%rax)
; SSE-NEXT:    movaps %xmm7, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: store_i32_stride6_vf4:
; AVX:         # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps (%rsi), %xmm2
; AVX-NEXT:    vmovaps (%rdx), %xmm1
; AVX-NEXT:    vmovaps (%rcx), %xmm3
; AVX-NEXT:    vmovaps (%r8), %xmm4
; AVX-NEXT:    vmovaps (%r9), %xmm5
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm6
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm7
; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm8
; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm9
; AVX-NEXT:    vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm11
; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5]
; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm3[0,0],xmm1[0,0]
; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0]
; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7]
; AVX-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm11
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm11[1,2],ymm7[5,6],ymm11[5,6]
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
; AVX-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7]
; AVX-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm2[3,3],ymm0[7,7],ymm2[7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6],ymm5[7]
; AVX-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,7,5]
; AVX-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX-NEXT:    vmovaps %ymm4, 32(%rax)
; AVX-NEXT:    vmovaps %ymm10, (%rax)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_i32_stride6_vf4:
; AVX2:         # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-NEXT:    vmovaps (%rsi), %xmm1
; AVX2-NEXT:    vmovaps (%rdx), %xmm2
; AVX2-NEXT:    vmovaps (%rcx), %xmm3
; AVX2-NEXT:    vmovaps (%r8), %xmm4
; AVX2-NEXT:    vmovaps (%r9), %xmm5
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm6
; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm7
; AVX2-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm8
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [0,4,0,4,0,4,0,4]
; AVX2-NEXT:    vpermps %ymm7, %ymm9, %ymm10
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm11 = [0,4,1,5,0,4,1,5]
; AVX2-NEXT:    # ymm11 = mem[0,1,0,1]
; AVX2-NEXT:    vpermps %ymm6, %ymm11, %ymm11
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm8, %ymm9, %ymm9
; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7]
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6]
; AVX2-NEXT:    vpermps %ymm6, %ymm10, %ymm6
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [1,5,2,6,1,5,2,6]
; AVX2-NEXT:    # ymm10 = mem[0,1,0,1]
; AVX2-NEXT:    vpermps %ymm7, %ymm10, %ymm7
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
; AVX2-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7]
; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,7,2,6,3,7]
; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-NEXT:    vpermps %ymm8, %ymm1, %ymm1
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX2-NEXT:    vmovaps %ymm4, 32(%rax)
; AVX2-NEXT:    vmovaps %ymm9, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: store_i32_stride6_vf4:
; AVX2-FP:         # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-FP-NEXT:    vmovaps (%rsi), %xmm1
; AVX2-FP-NEXT:    vmovaps (%rdx), %xmm2
; AVX2-FP-NEXT:    vmovaps (%rcx), %xmm3
; AVX2-FP-NEXT:    vmovaps (%r8), %xmm4
; AVX2-FP-NEXT:    vmovaps (%r9), %xmm5
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm6
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm7
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm8
; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [0,4,0,4,0,4,0,4]
; AVX2-FP-NEXT:    vpermps %ymm7, %ymm9, %ymm10
; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm11 = [0,4,1,5,0,4,1,5]
; AVX2-FP-NEXT:    # ymm11 = mem[0,1,0,1]
; AVX2-FP-NEXT:    vpermps %ymm6, %ymm11, %ymm11
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm8, %ymm9, %ymm9
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7]
; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6]
; AVX2-FP-NEXT:    vpermps %ymm6, %ymm10, %ymm6
; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [1,5,2,6,1,5,2,6]
; AVX2-FP-NEXT:    # ymm10 = mem[0,1,0,1]
; AVX2-FP-NEXT:    vpermps %ymm7, %ymm10, %ymm7
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,7,2,6,3,7]
; AVX2-FP-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-FP-NEXT:    vpermps %ymm8, %ymm1, %ymm1
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rax)
; AVX2-FP-NEXT:    vmovaps %ymm9, (%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: store_i32_stride6_vf4:
; AVX2-FCP:         # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-FCP-NEXT:    vmovaps (%rsi), %xmm1
; AVX2-FCP-NEXT:    vmovaps (%rdx), %xmm2
; AVX2-FCP-NEXT:    vmovaps (%rcx), %xmm3
; AVX2-FCP-NEXT:    vmovaps (%r8), %xmm4
; AVX2-FCP-NEXT:    vmovaps (%r9), %xmm5
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm6
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm7
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm8
; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [0,4,0,4,0,4,0,4]
; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm9, %ymm10
; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm11 = [0,4,1,5,0,4,1,5]
; AVX2-FCP-NEXT:    # ymm11 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm11, %ymm11
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm9, %ymm9
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7]
; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6]
; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm10, %ymm6
; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [1,5,2,6,1,5,2,6]
; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm10, %ymm7
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,7,2,6,3,7]
; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm1, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rax)
; AVX2-FCP-NEXT:    vmovaps %ymm9, (%rax)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: store_i32_stride6_vf4:
; AVX512:         # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512-NEXT:    vmovdqa (%r8), %xmm2
; AVX512-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: store_i32_stride6_vf4:
; AVX512-FCP:         # %bb.0:
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX512-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: store_i32_stride6_vf4:
; AVX512DQ:         # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm2
; AVX512DQ-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512DQ-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: store_i32_stride6_vf4:
; AVX512DQ-FCP:         # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: store_i32_stride6_vf4:
; AVX512BW:         # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT:    vmovdqa (%r8), %xmm2
; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512BW-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: store_i32_stride6_vf4:
; AVX512BW-FCP:         # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: store_i32_stride6_vf4:
; AVX512DQ-BW:         # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm2
; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf4:
; AVX512DQ-BW-FCP:         # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %in.vec0 = load <4 x i32>, ptr %in.vecptr0, align 64
  %in.vec1 = load <4 x i32>, ptr %in.vecptr1, align 64
  %in.vec2 = load <4 x i32>, ptr %in.vecptr2, align 64
  %in.vec3 = load <4 x i32>, ptr %in.vecptr3, align 64
  %in.vec4 = load <4 x i32>, ptr %in.vecptr4, align 64
  %in.vec5 = load <4 x i32>, ptr %in.vecptr5, align 64
  %1 = shufflevector <4 x i32> %in.vec0, <4 x i32> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %in.vec2, <4 x i32> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = shufflevector <4 x i32> %in.vec4, <4 x i32> %in.vec5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <8 x i32> %1, <8 x i32> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %5 = shufflevector <8 x i32> %3, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <16 x i32> %4, <16 x i32> %5, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %interleaved.vec = shufflevector <24 x i32> %6, <24 x i32> poison, <24 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23>
  store <24 x i32> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride6_vf8:
; SSE:         # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm4
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps (%rsi), %xmm0
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 16(%rsi), %xmm10
; SSE-NEXT:    movaps (%rdx), %xmm8
; SSE-NEXT:    movaps 16(%rdx), %xmm2
; SSE-NEXT:    movaps (%rcx), %xmm6
; SSE-NEXT:    movaps 16(%rcx), %xmm9
; SSE-NEXT:    movaps (%r8), %xmm5
; SSE-NEXT:    movaps 16(%r8), %xmm11
; SSE-NEXT:    movaps (%r9), %xmm7
; SSE-NEXT:    movaps 16(%r9), %xmm3
; SSE-NEXT:    movaps %xmm9, %xmm14
; SSE-NEXT:    unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1]
; SSE-NEXT:    movaps %xmm1, %xmm12
; SSE-NEXT:    unpckhps {{.*#+}} xmm12 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
; SSE-NEXT:    movaps %xmm11, %xmm13
; SSE-NEXT:    unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1]
; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,2],xmm12[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,0]
; SSE-NEXT:    movaps %xmm11, %xmm14
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1]
; SSE-NEXT:    movaps %xmm2, %xmm15
; SSE-NEXT:    unpcklps {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
; SSE-NEXT:    movaps %xmm11, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0]
; SSE-NEXT:    movaps %xmm15, %xmm10
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[2,3],xmm14[0,2]
; SSE-NEXT:    movaps %xmm5, %xmm14
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[3,3],xmm7[3,3]
; SSE-NEXT:    movaps %xmm8, %xmm11
; SSE-NEXT:    unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,3],xmm14[0,2]
; SSE-NEXT:    movaps %xmm4, %xmm14
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    unpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1]
; SSE-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm8, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1]
; SSE-NEXT:    movaps %xmm5, %xmm8
; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,2],xmm14[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,0]
; SSE-NEXT:    movaps %xmm7, %xmm6
; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3]
; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[0,2]
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movaps %xmm0, 32(%rax)
; SSE-NEXT:    movaps %xmm14, 48(%rax)
; SSE-NEXT:    movaps %xmm1, 96(%rax)
; SSE-NEXT:    movaps %xmm3, 112(%rax)
; SSE-NEXT:    movaps %xmm13, 160(%rax)
; SSE-NEXT:    movaps %xmm2, 176(%rax)
; SSE-NEXT:    movaps %xmm4, (%rax)
; SSE-NEXT:    movaps %xmm6, 16(%rax)
; SSE-NEXT:    movaps %xmm8, 64(%rax)
; SSE-NEXT:    movaps %xmm11, 80(%rax)
; SSE-NEXT:    movaps %xmm10, 128(%rax)
; SSE-NEXT:    movaps %xmm12, 144(%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: store_i32_stride6_vf8:
; AVX:         # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovaps (%rdi), %ymm7
; AVX-NEXT:    vmovaps (%rsi), %ymm8
; AVX-NEXT:    vmovaps (%rdx), %ymm2
; AVX-NEXT:    vmovaps (%rcx), %ymm3
; AVX-NEXT:    vmovaps (%r8), %ymm1
; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 16(%r9), %ymm4
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
; AVX-NEXT:    vmovaps (%rcx), %xmm9
; AVX-NEXT:    vmovaps (%rdx), %xmm10
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm10[1,2],xmm9[1,2]
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX-NEXT:    vmovaps (%rsi), %xmm5
; AVX-NEXT:    vmovaps (%rdi), %xmm6
; AVX-NEXT:    vunpckhps {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm12
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7]
; AVX-NEXT:    vbroadcastss 4(%r8), %xmm12
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7]
; AVX-NEXT:    vbroadcastss 4(%r9), %ymm12
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3],ymm4[4,5,6,7]
; AVX-NEXT:    vunpckhps {{.*#+}} ymm8 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7]
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm2[1,2],ymm3[1,2],ymm2[5,6],ymm3[5,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
; AVX-NEXT:    vbroadcastss 20(%r8), %xmm12
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3],ymm7[4,5,6,7]
; AVX-NEXT:    vbroadcastss 20(%r9), %ymm12
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7]
; AVX-NEXT:    vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm11, %ymm9
; AVX-NEXT:    vpermilps {{.*#+}} xmm10 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm10, %ymm10
; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7]
; AVX-NEXT:    vmovaps (%r9), %xmm10
; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm10[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm11, %ymm10
; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6],ymm10[7]
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7]
; AVX-NEXT:    vbroadcastss (%rcx), %xmm2
; AVX-NEXT:    vbroadcastss (%rdx), %xmm3
; AVX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm5
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, (%r8), %ymm3, %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX-NEXT:    vbroadcastss (%r9), %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX-NEXT:    vmovaps %ymm2, (%rax)
; AVX-NEXT:    vmovaps %ymm1, 160(%rax)
; AVX-NEXT:    vmovaps %ymm9, 64(%rax)
; AVX-NEXT:    vmovaps %ymm7, 128(%rax)
; AVX-NEXT:    vmovaps %ymm4, 32(%rax)
; AVX-NEXT:    vmovaps %ymm0, 96(%rax)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_i32_stride6_vf8:
; AVX2:         # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-NEXT:    vmovdqa (%rdx), %ymm3
; AVX2-NEXT:    vmovdqa (%rcx), %ymm4
; AVX2-NEXT:    vmovdqa (%r8), %ymm2
; AVX2-NEXT:    vmovdqa (%rsi), %xmm6
; AVX2-NEXT:    vmovdqa (%rdi), %xmm11
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm5
; AVX2-NEXT:    vmovdqa (%rcx), %xmm8
; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm8[1,2,2,3]
; AVX2-NEXT:    vmovdqa (%rdx), %xmm9
; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7]
; AVX2-NEXT:    vmovdqa (%r8), %xmm10
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 4(%r9), %ymm12
; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd (%rcx), %xmm12
; AVX2-NEXT:    vpbroadcastd (%rdx), %xmm13
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7]
; AVX2-NEXT:    vpbroadcastq %xmm10, %ymm11
; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
; AVX2-NEXT:    vmovdqa (%r9), %xmm12
; AVX2-NEXT:    vpbroadcastd %xmm12, %ymm11
; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7]
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 20(%r9), %ymm14
; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7]
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm7, %ymm7
; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3]
; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3]
; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7]
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7]
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm2[2,1,3,3,6,5,7,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7]
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; AVX2-NEXT:    vpbroadcastd 16(%r9), %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT:    vmovdqa %ymm0, 96(%rax)
; AVX2-NEXT:    vmovdqa %ymm8, 160(%rax)
; AVX2-NEXT:    vmovdqa %ymm7, 64(%rax)
; AVX2-NEXT:    vmovdqa %ymm11, 128(%rax)
; AVX2-NEXT:    vmovdqa %ymm6, (%rax)
; AVX2-NEXT:    vmovdqa %ymm5, 32(%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: store_i32_stride6_vf8:
; AVX2-FP:         # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm3
; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm4
; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm2
; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm6
; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm11
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm5
; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm8
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm8[1,2,2,3]
; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm9
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7]
; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm10
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 4(%r9), %ymm12
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd (%rcx), %xmm12
; AVX2-FP-NEXT:    vpbroadcastd (%rdx), %xmm13
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastq %xmm10, %ymm11
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm12
; AVX2-FP-NEXT:    vpbroadcastd %xmm12, %ymm11
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7]
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 20(%r9), %ymm14
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7]
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm7, %ymm7
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7]
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm2[2,1,3,3,6,5,7,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm9 = mem[0,2,2,3,4,6,6,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vpbroadcastd 16(%r9), %ymm1
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FP-NEXT:    vmovdqa %ymm0, 96(%rax)
; AVX2-FP-NEXT:    vmovdqa %ymm8, 160(%rax)
; AVX2-FP-NEXT:    vmovdqa %ymm7, 64(%rax)
; AVX2-FP-NEXT:    vmovdqa %ymm11, 128(%rax)
; AVX2-FP-NEXT:    vmovdqa %ymm6, (%rax)
; AVX2-FP-NEXT:    vmovdqa %ymm5, 32(%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: store_i32_stride6_vf8:
; AVX2-FCP:         # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm3
; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm4
; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm2
; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm5
; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm7
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm11
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm6
; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm9
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3]
; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm10
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm10[1,2,2,3]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7]
; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm12
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 4(%r9), %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd (%rcx), %xmm13
; AVX2-FCP-NEXT:    vpbroadcastd (%rdx), %xmm14
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3],ymm7[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastq %xmm12, %ymm11
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd (%r9), %ymm11
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5],ymm7[6,7]
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm3[1,1,2,3,5,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7]
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 20(%r9), %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7]
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3]
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm8, %ymm8
; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3]
; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm9, %ymm10
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5],ymm10[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm9, %ymm9
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7]
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3]
; AVX2-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm10 = [6,0,0,7]
; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm10, %ymm10
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7]
ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7] 1069; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,6,0,0,0,0,0,7] 1070; AVX2-FCP-NEXT: vpermd %ymm5, %ymm10, %ymm5 1071; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4,5,6],ymm5[7] 1072; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] 1073; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 1074; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 1075; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 1076; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] 1077; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] 1078; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm1 1079; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 1080; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) 1081; AVX2-FCP-NEXT: vmovdqa %ymm5, 160(%rax) 1082; AVX2-FCP-NEXT: vmovdqa %ymm8, 64(%rax) 1083; AVX2-FCP-NEXT: vmovdqa %ymm12, 128(%rax) 1084; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rax) 1085; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%rax) 1086; AVX2-FCP-NEXT: vzeroupper 1087; AVX2-FCP-NEXT: retq 1088; 1089; AVX512-LABEL: store_i32_stride6_vf8: 1090; AVX512: # %bb.0: 1091; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 1092; AVX512-NEXT: vmovdqa (%rdi), %ymm0 1093; AVX512-NEXT: vmovdqa (%rdx), %ymm1 1094; AVX512-NEXT: vmovdqa (%r8), %ymm2 1095; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 1096; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 1097; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 1098; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] 1099; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1100; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] 1101; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1102; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] 1103; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1104; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] 1105; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1106; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] 1107; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1108; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] 1109; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1110; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) 1111; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax) 1112; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) 1113; AVX512-NEXT: vzeroupper 1114; AVX512-NEXT: retq 1115; 1116; AVX512-FCP-LABEL: store_i32_stride6_vf8: 1117; AVX512-FCP: # %bb.0: 1118; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1119; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 1120; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1 1121; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 1122; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 1123; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 1124; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 1125; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] 1126; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1127; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] 1128; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1129; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] 1130; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1131; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = 
[18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] 1132; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1133; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] 1134; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1135; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] 1136; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1137; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) 1138; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) 1139; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 1140; AVX512-FCP-NEXT: vzeroupper 1141; AVX512-FCP-NEXT: retq 1142; 1143; AVX512DQ-LABEL: store_i32_stride6_vf8: 1144; AVX512DQ: # %bb.0: 1145; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 1146; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 1147; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 1148; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2 1149; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 1150; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 1151; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 1152; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] 1153; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1154; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] 1155; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1156; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] 1157; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1158; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] 1159; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1160; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] 1161; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1162; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] 1163; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1164; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) 1165; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) 1166; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) 1167; AVX512DQ-NEXT: vzeroupper 1168; AVX512DQ-NEXT: retq 1169; 1170; AVX512DQ-FCP-LABEL: store_i32_stride6_vf8: 1171; AVX512DQ-FCP: # %bb.0: 1172; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1173; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 1174; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1 1175; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 1176; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 1177; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 1178; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 1179; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] 1180; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1181; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] 1182; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1183; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] 1184; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1185; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] 1186; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1187; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] 1188; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1189; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] 1190; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1191; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) 1192; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) 1193; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 
1194; AVX512DQ-FCP-NEXT: vzeroupper 1195; AVX512DQ-FCP-NEXT: retq 1196; 1197; AVX512BW-LABEL: store_i32_stride6_vf8: 1198; AVX512BW: # %bb.0: 1199; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1200; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 1201; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 1202; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 1203; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 1204; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 1205; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 1206; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] 1207; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1208; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] 1209; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1210; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] 1211; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1212; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] 1213; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1214; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] 1215; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1216; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] 1217; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1218; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) 1219; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) 1220; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) 1221; AVX512BW-NEXT: vzeroupper 1222; AVX512BW-NEXT: retq 1223; 1224; AVX512BW-FCP-LABEL: store_i32_stride6_vf8: 1225; AVX512BW-FCP: # %bb.0: 1226; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1227; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 1228; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 1229; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 1230; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 1231; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 1232; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 1233; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] 1234; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1235; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] 1236; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1237; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] 1238; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1239; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] 1240; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1241; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] 1242; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1243; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] 1244; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1245; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) 1246; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) 1247; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 1248; AVX512BW-FCP-NEXT: vzeroupper 1249; AVX512BW-FCP-NEXT: retq 1250; 1251; AVX512DQ-BW-LABEL: store_i32_stride6_vf8: 1252; AVX512DQ-BW: # %bb.0: 1253; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1254; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 1255; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 1256; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 1257; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 1258; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 1259; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 
1260; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] 1261; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1262; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] 1263; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1264; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] 1265; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1266; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] 1267; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1268; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] 1269; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1270; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] 1271; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1272; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) 1273; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax) 1274; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax) 1275; AVX512DQ-BW-NEXT: vzeroupper 1276; AVX512DQ-BW-NEXT: retq 1277; 1278; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf8: 1279; AVX512DQ-BW-FCP: # %bb.0: 1280; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1281; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 1282; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 1283; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 1284; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 1285; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 1286; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 1287; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] 1288; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1289; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] 1290; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1291; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] 1292; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1293; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] 1294; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1295; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] 1296; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1297; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] 1298; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1299; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) 1300; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) 1301; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 1302; AVX512DQ-BW-FCP-NEXT: vzeroupper 1303; AVX512DQ-BW-FCP-NEXT: retq 1304 %in.vec0 = load <8 x i32>, ptr %in.vecptr0, align 64 1305 %in.vec1 = load <8 x i32>, ptr %in.vecptr1, align 64 1306 %in.vec2 = load <8 x i32>, ptr %in.vecptr2, align 64 1307 %in.vec3 = load <8 x i32>, ptr %in.vecptr3, align 64 1308 %in.vec4 = load <8 x i32>, ptr %in.vecptr4, align 64 1309 %in.vec5 = load <8 x i32>, ptr %in.vecptr5, align 64 1310 %1 = shufflevector <8 x i32> %in.vec0, <8 x i32> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1311 %2 = shufflevector <8 x i32> %in.vec2, <8 x i32> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1312 %3 = shufflevector <8 x i32> %in.vec4, <8 x i32> %in.vec5, <16 x i32> <i32 0, i32 1, 
i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1313 %4 = shufflevector <16 x i32> %1, <16 x i32> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1314 %5 = shufflevector <16 x i32> %3, <16 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1315 %6 = shufflevector <32 x i32> %4, <32 x i32> %5, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 1316 %interleaved.vec = shufflevector <48 x i32> %6, <48 x i32> poison, <48 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47> 1317 store <48 x i32> %interleaved.vec, ptr %out.vec, align 64 1318 ret void 1319} 1320 1321define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { 1322; SSE-LABEL: store_i32_stride6_vf16: 1323; SSE: # %bb.0: 1324; SSE-NEXT: subq $72, %rsp 1325; SSE-NEXT: movaps (%rdi), %xmm7 1326; SSE-NEXT: movaps 16(%rdi), %xmm8 1327; SSE-NEXT: movaps (%rsi), %xmm2 1328; SSE-NEXT: movaps 16(%rsi), %xmm6 1329; SSE-NEXT: movaps (%rdx), %xmm9 1330; SSE-NEXT: movaps 16(%rdx), %xmm10 1331; SSE-NEXT: movaps (%rcx), %xmm1 1332; SSE-NEXT: movaps 16(%rcx), %xmm0 1333; SSE-NEXT: movaps (%r8), %xmm3 1334; SSE-NEXT: movaps 16(%r8), %xmm14 1335; SSE-NEXT: movaps (%r9), %xmm4 1336; SSE-NEXT: movaps 16(%r9), %xmm13 1337; SSE-NEXT: movaps %xmm9, %xmm11 1338; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] 1339; SSE-NEXT: movaps %xmm7, %xmm5 1340; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] 1341; SSE-NEXT: movaps %xmm4, %xmm12 1342; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm3[0] 1343; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm5[2,3] 1344; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1345; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm11[0] 1346; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1347; SSE-NEXT: movaps %xmm3, %xmm5 1348; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] 1349; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2] 1350; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1351; SSE-NEXT: movaps %xmm1, %xmm5 1352; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] 1353; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] 1354; SSE-NEXT: movaps %xmm3, %xmm2 1355; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] 1356; 
SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[2,3] 1357; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill 1358; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0] 1359; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1360; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm4[3,3] 1361; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] 1362; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[0,2] 1363; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1364; SSE-NEXT: movaps %xmm10, %xmm2 1365; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1366; SSE-NEXT: movaps %xmm8, %xmm1 1367; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] 1368; SSE-NEXT: movaps %xmm13, %xmm3 1369; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm14[0] 1370; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] 1371; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1372; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1373; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1374; SSE-NEXT: movaps %xmm14, %xmm1 1375; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm13[1,1] 1376; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[0,2] 1377; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1378; SSE-NEXT: movaps %xmm0, %xmm1 1379; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] 1380; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] 1381; SSE-NEXT: movaps %xmm14, %xmm2 1382; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] 1383; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[2,3] 1384; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1385; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,0] 1386; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1387; SSE-NEXT: movaps 32(%rdi), %xmm12 1388; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm13[3,3] 1389; SSE-NEXT: movaps 32(%rdx), %xmm13 1390; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] 1391; SSE-NEXT: movaps 32(%rcx), %xmm0 1392; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm14[0,2] 1393; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1394; SSE-NEXT: movaps %xmm13, %xmm14 1395; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] 1396; SSE-NEXT: movaps 32(%rsi), %xmm1 1397; SSE-NEXT: movaps %xmm12, %xmm15 1398; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] 1399; SSE-NEXT: movaps 32(%r8), %xmm2 1400; SSE-NEXT: movaps 32(%r9), %xmm3 1401; SSE-NEXT: movaps %xmm3, %xmm11 1402; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] 1403; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm15[2,3] 1404; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] 1405; SSE-NEXT: movaps %xmm2, %xmm4 1406; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] 1407; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] 1408; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] 1409; SSE-NEXT: movaps %xmm0, %xmm1 1410; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] 1411; SSE-NEXT: movaps %xmm2, %xmm8 1412; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] 1413; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm12[2,3] 1414; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[2,0] 1415; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] 1416; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] 1417; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm2[0,2] 1418; SSE-NEXT: movaps 48(%rdx), %xmm3 1419; 
SSE-NEXT: movaps 48(%rcx), %xmm10 1420; SSE-NEXT: movaps %xmm3, %xmm5 1421; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] 1422; SSE-NEXT: movaps 48(%rdi), %xmm2 1423; SSE-NEXT: movaps 48(%rsi), %xmm9 1424; SSE-NEXT: movaps %xmm2, %xmm4 1425; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] 1426; SSE-NEXT: movaps 48(%r8), %xmm1 1427; SSE-NEXT: movaps 48(%r9), %xmm7 1428; SSE-NEXT: movaps %xmm7, %xmm6 1429; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] 1430; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] 1431; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] 1432; SSE-NEXT: movaps %xmm1, %xmm0 1433; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] 1434; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] 1435; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] 1436; SSE-NEXT: movaps %xmm10, %xmm0 1437; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] 1438; SSE-NEXT: movaps %xmm1, %xmm9 1439; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm7[1] 1440; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[2,3] 1441; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] 1442; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] 1443; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] 1444; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] 1445; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1446; SSE-NEXT: movaps %xmm3, 368(%rax) 1447; SSE-NEXT: movaps %xmm9, 352(%rax) 1448; SSE-NEXT: movaps %xmm2, 336(%rax) 1449; SSE-NEXT: movaps %xmm5, 320(%rax) 1450; SSE-NEXT: movaps %xmm6, 304(%rax) 1451; SSE-NEXT: movaps %xmm4, 288(%rax) 1452; SSE-NEXT: movaps %xmm13, 272(%rax) 1453; SSE-NEXT: movaps %xmm8, 256(%rax) 1454; SSE-NEXT: movaps %xmm12, 240(%rax) 1455; SSE-NEXT: movaps %xmm14, 224(%rax) 1456; SSE-NEXT: movaps %xmm11, 208(%rax) 1457; SSE-NEXT: movaps %xmm15, 192(%rax) 1458; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1459; SSE-NEXT: movaps %xmm0, 176(%rax) 1460; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1461; SSE-NEXT: movaps %xmm0, 160(%rax) 1462; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1463; SSE-NEXT: movaps %xmm0, 144(%rax) 1464; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1465; SSE-NEXT: movaps %xmm0, 128(%rax) 1466; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1467; SSE-NEXT: movaps %xmm0, 112(%rax) 1468; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1469; SSE-NEXT: movaps %xmm0, 96(%rax) 1470; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1471; SSE-NEXT: movaps %xmm0, 80(%rax) 1472; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 1473; SSE-NEXT: movaps %xmm0, 64(%rax) 1474; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1475; SSE-NEXT: movaps %xmm0, 48(%rax) 1476; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1477; SSE-NEXT: movaps %xmm0, 32(%rax) 1478; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1479; SSE-NEXT: movaps %xmm0, 16(%rax) 1480; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1481; SSE-NEXT: movaps %xmm0, (%rax) 1482; SSE-NEXT: addq $72, %rsp 1483; SSE-NEXT: retq 1484; 1485; AVX-LABEL: store_i32_stride6_vf16: 1486; AVX: # %bb.0: 1487; AVX-NEXT: subq $104, %rsp 1488; AVX-NEXT: vmovaps 32(%rdi), %ymm5 1489; AVX-NEXT: vmovaps 32(%rsi), %ymm13 1490; AVX-NEXT: vmovaps 32(%rdx), %ymm7 1491; AVX-NEXT: vmovaps 32(%rcx), %ymm9 1492; AVX-NEXT: vmovaps 32(%r8), %ymm11 1493; AVX-NEXT: 
vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1494; AVX-NEXT: vmovaps (%rcx), %xmm8 1495; AVX-NEXT: vmovaps 32(%rcx), %xmm3 1496; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1497; AVX-NEXT: vmovaps (%rdx), %xmm6 1498; AVX-NEXT: vmovaps 32(%rdx), %xmm10 1499; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1500; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,2],xmm8[1,2] 1501; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 1502; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1503; AVX-NEXT: vmovaps (%rsi), %xmm1 1504; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1505; AVX-NEXT: vmovaps (%rdi), %xmm2 1506; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1507; AVX-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1508; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 1509; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 1510; AVX-NEXT: vbroadcastss 4(%r8), %xmm1 1511; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 1512; AVX-NEXT: vbroadcastss 4(%r9), %ymm1 1513; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] 1514; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1515; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[4],ymm13[4],ymm5[5],ymm13[5] 1516; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 1517; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] 1518; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1519; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] 1520; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 1521; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] 1522; AVX-NEXT: vbroadcastss 48(%r9), %ymm1 1523; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 1524; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1525; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,2],xmm3[1,2] 1526; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 1527; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1528; AVX-NEXT: vmovaps 32(%rsi), %xmm3 1529; AVX-NEXT: vmovaps 32(%rdi), %xmm2 1530; AVX-NEXT: vunpckhps {{.*#+}} xmm15 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] 1531; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm11 1532; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] 1533; AVX-NEXT: vbroadcastss 36(%r8), %xmm11 1534; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3],ymm0[4,5,6,7] 1535; AVX-NEXT: vbroadcastss 36(%r9), %ymm11 1536; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5,6,7] 1537; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 1538; AVX-NEXT: vmovaps (%rdi), %ymm1 1539; AVX-NEXT: vmovaps (%rsi), %ymm0 1540; AVX-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 1541; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[2,3,2,3] 1542; AVX-NEXT: vmovaps (%rdx), %ymm11 1543; AVX-NEXT: vmovaps (%rcx), %ymm12 1544; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] 1545; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10 1546; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0] 1547; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3],ymm14[4,5,6,7] 1548; AVX-NEXT: vmovaps (%r8), %ymm14 1549; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1550; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7] 1551; AVX-NEXT: vbroadcastss 16(%r9), %ymm14 1552; AVX-NEXT: vblendps {{.*#+}} ymm14 = 
ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7] 1553; AVX-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[6],ymm13[6],ymm5[7],ymm13[7] 1554; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,2],ymm9[1,2],ymm7[5,6],ymm9[5,6] 1555; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] 1556; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] 1557; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] 1558; AVX-NEXT: vbroadcastss 52(%r8), %xmm10 1559; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] 1560; AVX-NEXT: vbroadcastss 52(%r9), %ymm10 1561; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7] 1562; AVX-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] 1563; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,2],ymm12[1,2],ymm11[5,6],ymm12[5,6] 1564; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 1565; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] 1566; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] 1567; AVX-NEXT: vbroadcastss 20(%r8), %xmm1 1568; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 1569; AVX-NEXT: vbroadcastss 20(%r9), %ymm1 1570; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] 1571; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] 1572; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] 1573; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 1574; AVX-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] 1575; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 1576; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] 1577; AVX-NEXT: vmovaps (%r9), %xmm4 1578; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm4[0,2,2,3] 1579; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 1580; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7] 1581; AVX-NEXT: vbroadcastss 32(%rcx), %xmm0 1582; AVX-NEXT: vbroadcastss 32(%rdx), %xmm6 1583; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] 1584; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 1585; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 1586; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] 1587; AVX-NEXT: vinsertf128 $1, 32(%r8), %ymm2, %ymm2 1588; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] 1589; AVX-NEXT: vbroadcastss 32(%r9), %ymm2 1590; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] 1591; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,0],ymm7[3,0],ymm9[7,4],ymm7[7,4] 1592; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] 1593; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] 1594; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 1595; AVX-NEXT: # ymm3 = mem[2,3,2,3] 1596; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] 1597; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] 1598; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3] 1599; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 1600; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] 1601; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 1602; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 1603; AVX-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] 1604; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] 1605; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm3 1606; AVX-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,1,3,3] 1607; 
AVX-NEXT: vinsertf128 $1, %xmm6, %ymm6, %ymm6 1608; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5],ymm6[6,7] 1609; AVX-NEXT: vmovaps 32(%r9), %xmm6 1610; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,2,2,3] 1611; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 1612; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5,6],ymm6[7] 1613; AVX-NEXT: vbroadcastss (%rcx), %xmm6 1614; AVX-NEXT: vbroadcastss (%rdx), %xmm7 1615; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] 1616; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 1617; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload 1618; AVX-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1] 1619; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm8 1620; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] 1621; AVX-NEXT: vinsertf128 $1, (%r8), %ymm7, %ymm7 1622; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] 1623; AVX-NEXT: vbroadcastss (%r9), %ymm7 1624; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] 1625; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm12[3,0],ymm11[3,0],ymm12[7,4],ymm11[7,4] 1626; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] 1627; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] 1628; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload 1629; AVX-NEXT: # ymm7 = mem[2,3,2,3] 1630; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] 1631; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5],ymm7[6,7] 1632; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3] 1633; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] 1634; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5,6],ymm7[7] 1635; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 1636; AVX-NEXT: vmovaps %ymm5, 160(%rax) 1637; AVX-NEXT: vmovaps %ymm6, (%rax) 1638; AVX-NEXT: vmovaps %ymm3, 256(%rax) 1639; AVX-NEXT: vmovaps %ymm0, 352(%rax) 1640; AVX-NEXT: vmovaps %ymm2, 192(%rax) 1641; AVX-NEXT: vmovaps %ymm4, 64(%rax) 1642; AVX-NEXT: vmovaps %ymm1, 128(%rax) 1643; AVX-NEXT: vmovaps %ymm10, 320(%rax) 1644; AVX-NEXT: vmovaps %ymm14, 96(%rax) 1645; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 1646; AVX-NEXT: vmovaps %ymm0, 224(%rax) 1647; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1648; AVX-NEXT: vmovaps %ymm0, 288(%rax) 1649; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1650; AVX-NEXT: vmovaps %ymm0, 32(%rax) 1651; AVX-NEXT: addq $104, %rsp 1652; AVX-NEXT: vzeroupper 1653; AVX-NEXT: retq 1654; 1655; AVX2-LABEL: store_i32_stride6_vf16: 1656; AVX2: # %bb.0: 1657; AVX2-NEXT: subq $200, %rsp 1658; AVX2-NEXT: vmovdqa (%rsi), %xmm12 1659; AVX2-NEXT: vmovdqa 32(%rsi), %xmm1 1660; AVX2-NEXT: vmovdqa (%rdi), %xmm0 1661; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 1662; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] 1663; AVX2-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill 1664; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 1665; AVX2-NEXT: vmovdqa (%rcx), %xmm4 1666; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1667; AVX2-NEXT: vmovdqa 32(%rcx), %xmm7 1668; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1669; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] 1670; AVX2-NEXT: vmovdqa (%rdx), %xmm5 1671; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1672; AVX2-NEXT: vmovdqa 32(%rdx), %xmm8 1673; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1674; 
AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] 1675; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 1676; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 1677; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] 1678; AVX2-NEXT: vmovdqa (%r8), %xmm13 1679; AVX2-NEXT: vmovdqa 32(%r8), %xmm6 1680; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero 1681; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] 1682; AVX2-NEXT: vpbroadcastd 4(%r9), %ymm4 1683; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] 1684; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1685; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1686; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1687; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 1688; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] 1689; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] 1690; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 1691; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 1692; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] 1693; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero 1694; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] 1695; AVX2-NEXT: vpbroadcastd 36(%r9), %ymm4 1696; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] 1697; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1698; AVX2-NEXT: vpbroadcastd 32(%rcx), %xmm3 1699; AVX2-NEXT: vpbroadcastd 32(%rdx), %xmm4 1700; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 1701; AVX2-NEXT: vmovdqa 32(%rdx), %ymm9 1702; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1703; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] 1704; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] 1705; AVX2-NEXT: vpbroadcastq %xmm6, %ymm2 1706; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] 1707; AVX2-NEXT: vmovdqa 32(%r9), %xmm15 1708; AVX2-NEXT: vpbroadcastd %xmm15, %ymm2 1709; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] 1710; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1711; AVX2-NEXT: vmovdqa 32(%rcx), %ymm14 1712; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] 1713; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7] 1714; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] 1715; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 1716; AVX2-NEXT: vmovdqa 32(%rdi), %ymm10 1717; AVX2-NEXT: vmovdqa 32(%rsi), %ymm8 1718; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] 1719; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] 1720; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero 1721; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] 1722; AVX2-NEXT: vpbroadcastd 52(%r9), %ymm3 1723; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] 1724; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1725; AVX2-NEXT: vpbroadcastd (%rcx), %xmm1 1726; AVX2-NEXT: vpbroadcastd (%rdx), %xmm3 1727; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 1728; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] 1729; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 1730; AVX2-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 1731; AVX2-NEXT: vpbroadcastq %xmm13, %ymm1 1732; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 1733; AVX2-NEXT: vmovdqa (%r9), %xmm1 1734; AVX2-NEXT: vpbroadcastd %xmm1, %ymm2 1735; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] 1736; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1737; AVX2-NEXT: vmovdqa (%rdx), %ymm2 1738; AVX2-NEXT: vmovdqa (%rcx), %ymm0 1739; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] 1740; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] 1741; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] 1742; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3] 1743; AVX2-NEXT: vmovdqa (%rdi), %ymm7 1744; AVX2-NEXT: vmovdqa (%rsi), %ymm5 1745; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] 1746; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] 1747; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero 1748; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] 1749; AVX2-NEXT: vpbroadcastd 20(%r9), %ymm12 1750; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] 1751; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1752; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 1753; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload 1754; AVX2-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] 1755; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] 1756; AVX2-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload 1757; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 1758; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] 1759; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] 1760; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7] 1761; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] 1762; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] 1763; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7] 1764; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] 1765; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] 1766; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] 1767; AVX2-NEXT: vmovdqa 32(%r8), %ymm11 1768; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7] 1769; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] 1770; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] 1771; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] 1772; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] 1773; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] 1774; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 1775; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload 1776; AVX2-NEXT: # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3] 1777; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] 1778; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 1779; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm13 1780; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] 1781; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] 1782; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7] 1783; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] 1784; AVX2-NEXT: vpermq {{.*#+}} ymm13 = 
ymm13[0,1,2,1] 1785; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] 1786; AVX2-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] 1787; AVX2-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] 1788; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] 1789; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] 1790; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] 1791; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] 1792; AVX2-NEXT: vpbroadcastd 48(%r9), %ymm9 1793; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] 1794; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] 1795; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] 1796; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3] 1797; AVX2-NEXT: vmovdqa (%r8), %ymm9 1798; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,3,3,6,5,7,7] 1799; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] 1800; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] 1801; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7] 1802; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] 1803; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] 1804; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] 1805; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] 1806; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 1807; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 1808; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] 1809; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] 1810; AVX2-NEXT: vpbroadcastd 16(%r9), %ymm2 1811; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] 1812; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 1813; AVX2-NEXT: vmovdqa %ymm0, 96(%rax) 1814; AVX2-NEXT: vmovdqa %ymm3, 160(%rax) 1815; AVX2-NEXT: vmovdqa %ymm8, 288(%rax) 1816; AVX2-NEXT: vmovdqa %ymm6, 256(%rax) 1817; AVX2-NEXT: vmovdqa %ymm4, 352(%rax) 1818; AVX2-NEXT: vmovdqa %ymm1, 64(%rax) 1819; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1820; AVX2-NEXT: vmovaps %ymm0, 128(%rax) 1821; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1822; AVX2-NEXT: vmovaps %ymm0, (%rax) 1823; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1824; AVX2-NEXT: vmovaps %ymm0, 320(%rax) 1825; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1826; AVX2-NEXT: vmovaps %ymm0, 192(%rax) 1827; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1828; AVX2-NEXT: vmovaps %ymm0, 224(%rax) 1829; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1830; AVX2-NEXT: vmovaps %ymm0, 32(%rax) 1831; AVX2-NEXT: addq $200, %rsp 1832; AVX2-NEXT: vzeroupper 1833; AVX2-NEXT: retq 1834; 1835; AVX2-FP-LABEL: store_i32_stride6_vf16: 1836; AVX2-FP: # %bb.0: 1837; AVX2-FP-NEXT: subq $200, %rsp 1838; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm12 1839; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm1 1840; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 1841; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 1842; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] 1843; AVX2-FP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill 1844; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 1845; AVX2-FP-NEXT: vmovdqa 
(%rcx), %xmm4 1846; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1847; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm7 1848; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1849; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] 1850; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm5 1851; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1852; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm8 1853; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1854; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] 1855; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 1856; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 1857; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] 1858; AVX2-FP-NEXT: vmovdqa (%r8), %xmm13 1859; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm6 1860; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero 1861; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] 1862; AVX2-FP-NEXT: vpbroadcastd 4(%r9), %ymm4 1863; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] 1864; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1865; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1866; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1867; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 1868; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] 1869; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] 1870; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 1871; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 1872; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] 1873; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero 1874; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] 1875; AVX2-FP-NEXT: vpbroadcastd 36(%r9), %ymm4 1876; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] 1877; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1878; AVX2-FP-NEXT: vpbroadcastd 32(%rcx), %xmm3 1879; AVX2-FP-NEXT: vpbroadcastd 32(%rdx), %xmm4 1880; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 1881; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm9 1882; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1883; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] 1884; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] 1885; AVX2-FP-NEXT: vpbroadcastq %xmm6, %ymm2 1886; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] 1887; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm15 1888; AVX2-FP-NEXT: vpbroadcastd %xmm15, %ymm2 1889; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] 1890; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1891; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm14 1892; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] 1893; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7] 1894; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] 1895; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 1896; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm10 1897; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm8 1898; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] 1899; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] 1900; 
AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero 1901; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] 1902; AVX2-FP-NEXT: vpbroadcastd 52(%r9), %ymm3 1903; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] 1904; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1905; AVX2-FP-NEXT: vpbroadcastd (%rcx), %xmm1 1906; AVX2-FP-NEXT: vpbroadcastd (%rdx), %xmm3 1907; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 1908; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] 1909; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 1910; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 1911; AVX2-FP-NEXT: vpbroadcastq %xmm13, %ymm1 1912; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 1913; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1 1914; AVX2-FP-NEXT: vpbroadcastd %xmm1, %ymm2 1915; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] 1916; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1917; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm2 1918; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm0 1919; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] 1920; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] 1921; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] 1922; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3] 1923; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm7 1924; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm5 1925; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] 1926; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] 1927; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero 1928; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] 1929; AVX2-FP-NEXT: vpbroadcastd 20(%r9), %ymm12 1930; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] 1931; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1932; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 1933; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload 1934; AVX2-FP-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] 1935; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] 1936; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload 1937; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 1938; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] 1939; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] 1940; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7] 1941; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] 1942; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] 1943; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7] 1944; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] 1945; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] 1946; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] 1947; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm11 1948; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7] 1949; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] 1950; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] 1951; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] 1952; AVX2-FP-NEXT: vpermq {{.*#+}} 
ymm13 = ymm13[2,1,2,3] 1953; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] 1954; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 1955; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload 1956; AVX2-FP-NEXT: # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3] 1957; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] 1958; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 1959; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm13 1960; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] 1961; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] 1962; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7] 1963; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] 1964; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] 1965; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] 1966; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] 1967; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] 1968; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] 1969; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] 1970; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] 1971; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] 1972; AVX2-FP-NEXT: vpbroadcastd 48(%r9), %ymm9 1973; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] 1974; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] 1975; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] 1976; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3] 1977; AVX2-FP-NEXT: vmovdqa (%r8), %ymm9 1978; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,3,3,6,5,7,7] 1979; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] 1980; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] 1981; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7] 1982; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] 1983; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] 1984; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] 1985; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] 1986; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 1987; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 1988; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] 1989; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] 1990; AVX2-FP-NEXT: vpbroadcastd 16(%r9), %ymm2 1991; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] 1992; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1993; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) 1994; AVX2-FP-NEXT: vmovdqa %ymm3, 160(%rax) 1995; AVX2-FP-NEXT: vmovdqa %ymm8, 288(%rax) 1996; AVX2-FP-NEXT: vmovdqa %ymm6, 256(%rax) 1997; AVX2-FP-NEXT: vmovdqa %ymm4, 352(%rax) 1998; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rax) 1999; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2000; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax) 2001; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2002; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) 2003; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2004; AVX2-FP-NEXT: 
vmovaps %ymm0, 320(%rax) 2005; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2006; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rax) 2007; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2008; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) 2009; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2010; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) 2011; AVX2-FP-NEXT: addq $200, %rsp 2012; AVX2-FP-NEXT: vzeroupper 2013; AVX2-FP-NEXT: retq 2014; 2015; AVX2-FCP-LABEL: store_i32_stride6_vf16: 2016; AVX2-FCP: # %bb.0: 2017; AVX2-FCP-NEXT: subq $232, %rsp 2018; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm12 2019; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 2020; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm15 2021; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 2022; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] 2023; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 2024; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm8 2025; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 2026; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2027; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,2,2,3] 2028; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm9 2029; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm10 2030; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2031; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,3] 2032; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 2033; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] 2034; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] 2035; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm0 2036; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm4 2037; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero 2038; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] 2039; AVX2-FCP-NEXT: vpbroadcastd 4(%r9), %ymm5 2040; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] 2041; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2042; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 2043; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2044; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 2045; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,2,2,3] 2046; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,3] 2047; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] 2048; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] 2049; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] 2050; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero 2051; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] 2052; AVX2-FCP-NEXT: vpbroadcastd 36(%r9), %ymm5 2053; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] 2054; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2055; AVX2-FCP-NEXT: vpbroadcastd 32(%rcx), %xmm3 2056; AVX2-FCP-NEXT: vpbroadcastd 32(%rdx), %xmm5 2057; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] 2058; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm10 2059; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2060; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm11 2061; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] 2062; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] 2063; AVX2-FCP-NEXT: vpbroadcastq %xmm4, %ymm2 2064; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpbroadcastd 32(%r9), %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,2,2,4,5,6,6]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm14
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastd 52(%r9), %ymm3
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm1
; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm3
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm12[0],xmm15[1],xmm12[1]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm1
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm5
; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm1
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,2,4,5,6,6]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm3[2,1,2,3]
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13
; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm4
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm15
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7
; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm8
; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [2,2,3,3,2,2,3,3]
; AVX2-FCP-NEXT: # ymm12 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpermd %ymm8, %ymm12, %ymm9
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7]
; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm9
; AVX2-FCP-NEXT: vpermd %ymm9, %ymm12, %ymm15
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm15[1],ymm7[2,3,4,5,6],ymm15[7]
; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3]
; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm15
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7]
; AVX2-FCP-NEXT: vpermd %ymm15, %ymm6, %ymm7
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7]
; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm7
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,6,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm6
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm6
; AVX2-FCP-NEXT: vpermd %ymm15, %ymm12, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm6
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[1],ymm14[1],ymm7[4],ymm14[4],ymm7[5],ymm14[5]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vpbroadcastd 48(%r9), %ymm7
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm7[2,3]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7]
; AVX2-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7]
; AVX2-FCP-NEXT: vpermd %ymm9, %ymm0, %ymm7
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[4],ymm1[4],ymm5[5],ymm1[5]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm1
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm3, 160(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm6, 288(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm2, 256(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax)
; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax)
; AVX2-FCP-NEXT: addq $232, %rsp
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i32_stride6_vf16:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512-NEXT: vmovdqa64 (%rsi), %zmm3
; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512-NEXT: vmovdqa64 (%rcx), %zmm5
; AVX512-NEXT: vmovdqa64 (%r8), %zmm1
; AVX512-NEXT: vmovdqa64 (%r9), %zmm0
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
; AVX512-NEXT: vmovdqa (%rdx), %ymm7
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12]
; AVX512-NEXT: vpermi2d (%rcx), %ymm7, %ymm8
; AVX512-NEXT: movb $36, %cl
; AVX512-NEXT: kmovw %ecx, %k1
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7]
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512-NEXT: vpermi2d %zmm1, %zmm6, %zmm7
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm6
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512-NEXT: vpermi2d %zmm1, %zmm8, %zmm7
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm8
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
; AVX512-NEXT: movb $-110, %cl
; AVX512-NEXT: kmovw %ecx, %k2
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2}
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512-NEXT: vpermi2d %zmm1, %zmm9, %zmm7
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm9
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512-NEXT: vpermi2d %zmm1, %zmm10, %zmm7
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm10
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512-NEXT: vmovdqa (%rdi), %ymm11
; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3]
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512-NEXT: vpermi2d %zmm1, %zmm7, %zmm11
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512-NEXT: vpermi2d %zmm0, %zmm11, %zmm7
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm11
; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15]
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7]
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512-NEXT: vpermi2d %zmm1, %zmm11, %zmm2
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm1
; AVX512-NEXT: vmovdqa64 %zmm10, (%rax)
; AVX512-NEXT: vmovdqa64 %zmm9, 192(%rax)
; AVX512-NEXT: vmovdqa64 %zmm1, 320(%rax)
; AVX512-NEXT: vmovdqa64 %zmm8, 256(%rax)
; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rax)
; AVX512-NEXT: vmovdqa64 %zmm6, 64(%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i32_stride6_vf16:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm5
; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2
; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm0
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm6
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
; AVX512-FCP-NEXT: movb $-110, %cl
; AVX512-FCP-NEXT: kmovw %ecx, %k2
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm7, %zmm8
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0]
; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm8
; AVX512-FCP-NEXT: movb $36, %cl
; AVX512-FCP-NEXT: kmovw %ecx, %k1
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm7
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm8
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm7
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm11, %zmm7
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm11
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2344; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 2345; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] 2346; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 2347; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 2348; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} 2349; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] 2350; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 2351; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] 2352; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 2353; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) 2354; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) 2355; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) 2356; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax) 2357; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) 2358; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax) 2359; AVX512-FCP-NEXT: vzeroupper 2360; AVX512-FCP-NEXT: retq 2361; 2362; AVX512DQ-LABEL: store_i32_stride6_vf16: 2363; AVX512DQ: # %bb.0: 2364; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 2365; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 2366; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm3 2367; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4 2368; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm5 2369; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm1 2370; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm0 2371; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] 2372; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 2373; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 2374; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm7 2375; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] 2376; AVX512DQ-NEXT: vpermi2d (%rcx), %ymm7, %ymm8 2377; AVX512DQ-NEXT: movb $36, %cl 2378; AVX512DQ-NEXT: kmovw %ecx, %k1 2379; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7] 2380; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] 2381; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 2382; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] 2383; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 2384; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] 2385; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 2386; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] 2387; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 2388; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 2389; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} 2390; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] 2391; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 2392; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] 2393; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 2394; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] 2395; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 2396; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 2397; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] 2398; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 2399; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 2400; AVX512DQ-NEXT: movb $-110, %cl 2401; AVX512DQ-NEXT: kmovw %ecx, %k2 2402; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} 2403; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] 2404; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 2405; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} 
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm9
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm10, %zmm7
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm10
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3]
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm7, %zmm11
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm11, %zmm7
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm11
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15]
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7]
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm11, %zmm2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm1
; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 256(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i32_stride6_vf16:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm5
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm0
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm6
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
; AVX512DQ-FCP-NEXT: movb $-110, %cl
; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm7, %zmm8
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm8
; AVX512DQ-FCP-NEXT: movb $36, %cl
; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm7
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm8
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm7
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm11, %zmm7
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm11
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm3, %zmm2
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i32_stride6_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3
; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5
; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1
; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
; AVX512BW-NEXT: vmovdqa (%rdx), %ymm7
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12]
; AVX512BW-NEXT: vpermi2d (%rcx), %ymm7, %ymm8
; AVX512BW-NEXT: movb $36, %cl
; AVX512BW-NEXT: kmovd %ecx, %k1
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7]
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
; AVX512BW-NEXT: movb $-110, %cl
; AVX512BW-NEXT: kmovd %ecx, %k2
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11
; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm7, %zmm11
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm7
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11
; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15]
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7]
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm11, %zmm2
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i32_stride6_vf16:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm5
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm6
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
; AVX512BW-FCP-NEXT: movb $-110, %cl
; AVX512BW-FCP-NEXT: kmovd %ecx, %k2
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm7, %zmm8
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0]
; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm8
; AVX512BW-FCP-NEXT: movb $36, %cl
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm7
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm8
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm7
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm11, %zmm7
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm11
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm3, %zmm2
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i32_stride6_vf16:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm0
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm7
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12]
; AVX512DQ-BW-NEXT: vpermi2d (%rcx), %ymm7, %ymm8
; AVX512DQ-BW-NEXT: movb $36, %cl
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7]
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
; AVX512DQ-BW-NEXT: movb $-110, %cl
; AVX512DQ-BW-NEXT: kmovd %ecx, %k2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2}
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm11
; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3]
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm7, %zmm11
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm7
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11
; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15]
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7]
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm11, %zmm2
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 192(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 320(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 256(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 128(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 64(%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf16:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm6
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
; AVX512DQ-BW-FCP-NEXT: movb $-110, %cl
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm7, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm8
; AVX512DQ-BW-FCP-NEXT: movb $36, %cl
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm8
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm11, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm11
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm3, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <16 x i32>, ptr %in.vecptr0, align 64
  %in.vec1 = load <16 x i32>, ptr %in.vecptr1, align 64
  %in.vec2 = load <16 x i32>, ptr %in.vecptr2, align 64
  %in.vec3 = load <16 x i32>, ptr %in.vecptr3, align 64
  %in.vec4 = load <16 x i32>, ptr %in.vecptr4, align 64
  %in.vec5 = load <16 x i32>, ptr %in.vecptr5, align 64
  %1 = shufflevector <16 x i32> %in.vec0, <16 x i32> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %2 = shufflevector <16 x i32> %in.vec2, <16 x i32> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %3 = shufflevector <16 x i32> %in.vec4, <16 x i32> %in.vec5, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %4 = shufflevector <32 x i32> %1, <32 x i32> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %5 = shufflevector <32 x i32> %3, <32 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <64 x i32> %4, <64 x i32> %5, <96 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
  %interleaved.vec = shufflevector <96 x i32> %6, <96 x i32> poison, <96 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 80, i32 1, i32 17, i32 33, i32 49, i32 65, i32 81, i32 2, i32 18, i32 34, i32 50, i32 66, i32 82, i32 3, i32 19, i32 35, i32 51, i32 67, i32 83, i32 4, i32 20, i32 36, i32 52, i32 68, i32 84, i32 5, i32 21, i32 37, i32 53, i32 69, i32 85, i32 6, i32 22, i32 38, i32 54, i32 70, i32 86, i32 7, i32 23, i32 39, i32 55, i32 71, i32 87, i32 8, i32 24, i32 40, i32 56, i32 72, i32 88, i32 9, i32 25, i32 41, i32 57, i32 73, i32 89, i32 10, i32 26, i32 42, i32 58, i32 74, i32 90, i32 11, i32 27, i32 43, i32 59, i32 75, i32 91, i32 12, i32 28, i32 44, i32 60, i32 76, i32 92, i32 13, i32 29, i32 45, i32 61, i32 77, i32 93, i32 14, i32 30, i32 46, i32 62, i32 78, i32 94, i32 15, i32 31, i32 47, i32 63, i32 79, i32 95>
  store <96 x i32> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride6_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $456, %rsp # imm = 0x1C8
; SSE-NEXT: movaps (%rdi), %xmm9
; SSE-NEXT: movaps 16(%rdi), %xmm10
; SSE-NEXT: movaps (%rsi), %xmm4
; SSE-NEXT: movaps 16(%rsi), %xmm0
; SSE-NEXT: movaps (%rdx), %xmm11
; SSE-NEXT: movaps 16(%rdx), %xmm12
; SSE-NEXT: movaps (%rcx), %xmm5
; SSE-NEXT: movaps 16(%rcx), %xmm1
; SSE-NEXT: movaps (%r8), %xmm6
; SSE-NEXT: movaps 16(%r8), %xmm2
; SSE-NEXT: movaps (%r9), %xmm7
; SSE-NEXT: movaps 16(%r9), %xmm3
; SSE-NEXT: movaps %xmm11, %xmm13
; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
; SSE-NEXT: movaps %xmm9, %xmm8
; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
; SSE-NEXT: movaps %xmm7, %xmm14
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm6[0]
; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm8[2,3]
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm6, %xmm8
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[0,2]
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm5, %xmm8
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
; SSE-NEXT: movaps %xmm6, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm9[2,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0]
; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm7[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[0,2]
; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm12, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
; SSE-NEXT: movaps %xmm10, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT: movaps %xmm3, %xmm6
; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,3]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,0]
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 32(%rdi), %xmm5
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: movaps 32(%rdx), %xmm6
; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3]
; SSE-NEXT: movaps 32(%rcx), %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm6, %xmm7
; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
; SSE-NEXT: movaps 32(%rsi), %xmm1
; SSE-NEXT: movaps %xmm5, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps 32(%r8), %xmm2
; SSE-NEXT: movaps 32(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
xmm7[2,3],xmm4[0,2] 2977; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2978; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] 2979; SSE-NEXT: movaps %xmm0, %xmm1 2980; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] 2981; SSE-NEXT: movaps %xmm2, %xmm4 2982; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] 2983; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[2,3] 2984; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2985; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] 2986; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2987; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] 2988; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] 2989; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] 2990; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2991; SSE-NEXT: movaps 48(%rdx), %xmm6 2992; SSE-NEXT: movaps 48(%rcx), %xmm0 2993; SSE-NEXT: movaps %xmm6, %xmm5 2994; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 2995; SSE-NEXT: movaps 48(%rdi), %xmm7 2996; SSE-NEXT: movaps 48(%rsi), %xmm1 2997; SSE-NEXT: movaps %xmm7, %xmm4 2998; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 2999; SSE-NEXT: movaps 48(%r8), %xmm2 3000; SSE-NEXT: movaps 48(%r9), %xmm3 3001; SSE-NEXT: movaps %xmm3, %xmm8 3002; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] 3003; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3] 3004; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3005; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] 3006; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3007; SSE-NEXT: movaps %xmm2, %xmm4 3008; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] 3009; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] 3010; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3011; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] 3012; SSE-NEXT: movaps %xmm0, %xmm1 3013; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] 3014; SSE-NEXT: movaps %xmm2, %xmm4 3015; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] 3016; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] 3017; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3018; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] 3019; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3020; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] 3021; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] 3022; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] 3023; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3024; SSE-NEXT: movaps 64(%rdx), %xmm6 3025; SSE-NEXT: movaps 64(%rcx), %xmm0 3026; SSE-NEXT: movaps %xmm6, %xmm5 3027; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 3028; SSE-NEXT: movaps 64(%rdi), %xmm7 3029; SSE-NEXT: movaps 64(%rsi), %xmm1 3030; SSE-NEXT: movaps %xmm7, %xmm4 3031; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 3032; SSE-NEXT: movaps 64(%r8), %xmm2 3033; SSE-NEXT: movaps 64(%r9), %xmm3 3034; SSE-NEXT: movaps %xmm3, %xmm8 3035; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] 3036; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3] 3037; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3038; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] 3039; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3040; SSE-NEXT: movaps %xmm2, %xmm4 3041; SSE-NEXT: shufps {{.*#+}} xmm4 = 
xmm4[1,1],xmm3[1,1] 3042; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] 3043; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3044; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] 3045; SSE-NEXT: movaps %xmm0, %xmm1 3046; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] 3047; SSE-NEXT: movaps %xmm2, %xmm4 3048; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] 3049; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] 3050; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill 3051; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] 3052; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3053; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] 3054; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] 3055; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] 3056; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3057; SSE-NEXT: movaps 80(%rdx), %xmm6 3058; SSE-NEXT: movaps 80(%rcx), %xmm0 3059; SSE-NEXT: movaps %xmm6, %xmm5 3060; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 3061; SSE-NEXT: movaps 80(%rdi), %xmm7 3062; SSE-NEXT: movaps 80(%rsi), %xmm1 3063; SSE-NEXT: movaps %xmm7, %xmm4 3064; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 3065; SSE-NEXT: movaps 80(%r8), %xmm2 3066; SSE-NEXT: movaps 80(%r9), %xmm3 3067; SSE-NEXT: movaps %xmm3, %xmm8 3068; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] 3069; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3] 3070; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3071; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] 3072; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3073; SSE-NEXT: movaps %xmm2, %xmm4 3074; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] 3075; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] 3076; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3077; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] 3078; SSE-NEXT: movaps %xmm0, %xmm1 3079; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] 3080; SSE-NEXT: movaps %xmm2, %xmm4 3081; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] 3082; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] 3083; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3084; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] 3085; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3086; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] 3087; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] 3088; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] 3089; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3090; SSE-NEXT: movaps 96(%rdx), %xmm9 3091; SSE-NEXT: movaps 96(%rcx), %xmm0 3092; SSE-NEXT: movaps %xmm9, %xmm14 3093; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] 3094; SSE-NEXT: movaps 96(%rdi), %xmm11 3095; SSE-NEXT: movaps 96(%rsi), %xmm1 3096; SSE-NEXT: movaps %xmm11, %xmm13 3097; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] 3098; SSE-NEXT: movaps 96(%r8), %xmm2 3099; SSE-NEXT: movaps 96(%r9), %xmm3 3100; SSE-NEXT: movaps %xmm3, %xmm15 3101; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] 3102; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3] 3103; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] 3104; SSE-NEXT: movaps %xmm2, %xmm4 3105; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] 3106; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] 
3107; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] 3108; SSE-NEXT: movaps %xmm0, %xmm1 3109; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] 3110; SSE-NEXT: movaps %xmm2, %xmm8 3111; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] 3112; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[2,3] 3113; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] 3114; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] 3115; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] 3116; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] 3117; SSE-NEXT: movaps 112(%rdx), %xmm3 3118; SSE-NEXT: movaps 112(%rcx), %xmm12 3119; SSE-NEXT: movaps %xmm3, %xmm5 3120; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] 3121; SSE-NEXT: movaps 112(%rdi), %xmm2 3122; SSE-NEXT: movaps 112(%rsi), %xmm10 3123; SSE-NEXT: movaps %xmm2, %xmm4 3124; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] 3125; SSE-NEXT: movaps 112(%r8), %xmm1 3126; SSE-NEXT: movaps 112(%r9), %xmm7 3127; SSE-NEXT: movaps %xmm7, %xmm6 3128; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] 3129; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] 3130; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] 3131; SSE-NEXT: movaps %xmm1, %xmm0 3132; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] 3133; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] 3134; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] 3135; SSE-NEXT: movaps %xmm12, %xmm0 3136; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] 3137; SSE-NEXT: movaps %xmm1, %xmm10 3138; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] 3139; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[2,3] 3140; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] 3141; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] 3142; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] 3143; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] 3144; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 3145; SSE-NEXT: movaps %xmm3, 752(%rax) 3146; SSE-NEXT: movaps %xmm10, 736(%rax) 3147; SSE-NEXT: movaps %xmm2, 720(%rax) 3148; SSE-NEXT: movaps %xmm5, 704(%rax) 3149; SSE-NEXT: movaps %xmm6, 688(%rax) 3150; SSE-NEXT: movaps %xmm4, 672(%rax) 3151; SSE-NEXT: movaps %xmm9, 656(%rax) 3152; SSE-NEXT: movaps %xmm8, 640(%rax) 3153; SSE-NEXT: movaps %xmm11, 624(%rax) 3154; SSE-NEXT: movaps %xmm14, 608(%rax) 3155; SSE-NEXT: movaps %xmm15, 592(%rax) 3156; SSE-NEXT: movaps %xmm13, 576(%rax) 3157; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3158; SSE-NEXT: movaps %xmm0, 560(%rax) 3159; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3160; SSE-NEXT: movaps %xmm0, 544(%rax) 3161; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3162; SSE-NEXT: movaps %xmm0, 528(%rax) 3163; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3164; SSE-NEXT: movaps %xmm0, 512(%rax) 3165; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3166; SSE-NEXT: movaps %xmm0, 496(%rax) 3167; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3168; SSE-NEXT: movaps %xmm0, 480(%rax) 3169; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3170; SSE-NEXT: movaps %xmm0, 464(%rax) 3171; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 3172; SSE-NEXT: movaps %xmm0, 448(%rax) 3173; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3174; SSE-NEXT: movaps %xmm0, 432(%rax) 3175; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload 3176; SSE-NEXT: movaps %xmm0, 416(%rax) 3177; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3178; SSE-NEXT: movaps %xmm0, 400(%rax) 3179; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3180; SSE-NEXT: movaps %xmm0, 384(%rax) 3181; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3182; SSE-NEXT: movaps %xmm0, 368(%rax) 3183; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3184; SSE-NEXT: movaps %xmm0, 352(%rax) 3185; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3186; SSE-NEXT: movaps %xmm0, 336(%rax) 3187; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3188; SSE-NEXT: movaps %xmm0, 320(%rax) 3189; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3190; SSE-NEXT: movaps %xmm0, 304(%rax) 3191; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3192; SSE-NEXT: movaps %xmm0, 288(%rax) 3193; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3194; SSE-NEXT: movaps %xmm0, 272(%rax) 3195; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3196; SSE-NEXT: movaps %xmm0, 256(%rax) 3197; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3198; SSE-NEXT: movaps %xmm0, 240(%rax) 3199; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3200; SSE-NEXT: movaps %xmm0, 224(%rax) 3201; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3202; SSE-NEXT: movaps %xmm0, 208(%rax) 3203; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3204; SSE-NEXT: movaps %xmm0, 192(%rax) 3205; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3206; SSE-NEXT: movaps %xmm0, 176(%rax) 3207; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3208; SSE-NEXT: movaps %xmm0, 160(%rax) 3209; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3210; SSE-NEXT: movaps %xmm0, 144(%rax) 3211; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3212; SSE-NEXT: movaps %xmm0, 128(%rax) 3213; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3214; SSE-NEXT: movaps %xmm0, 112(%rax) 3215; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3216; SSE-NEXT: movaps %xmm0, 96(%rax) 3217; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3218; SSE-NEXT: movaps %xmm0, 80(%rax) 3219; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3220; SSE-NEXT: movaps %xmm0, 64(%rax) 3221; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3222; SSE-NEXT: movaps %xmm0, 48(%rax) 3223; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3224; SSE-NEXT: movaps %xmm0, 32(%rax) 3225; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3226; SSE-NEXT: movaps %xmm0, 16(%rax) 3227; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3228; SSE-NEXT: movaps %xmm0, (%rax) 3229; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 3230; SSE-NEXT: retq 3231; 3232; AVX-LABEL: store_i32_stride6_vf32: 3233; AVX: # %bb.0: 3234; AVX-NEXT: subq $1032, %rsp # imm = 0x408 3235; AVX-NEXT: vmovaps (%rdi), %ymm12 3236; AVX-NEXT: vmovaps (%rsi), %ymm8 3237; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3238; AVX-NEXT: vmovaps (%rdx), %ymm4 3239; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3240; AVX-NEXT: vmovaps (%rcx), %ymm6 3241; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3242; AVX-NEXT: vmovaps (%r8), %ymm5 3243; 
AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3244; AVX-NEXT: vmovaps (%rcx), %xmm1 3245; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3246; AVX-NEXT: vmovaps 32(%rcx), %xmm2 3247; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3248; AVX-NEXT: vmovaps (%rdx), %xmm0 3249; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3250; AVX-NEXT: vmovaps 32(%rdx), %xmm3 3251; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3252; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] 3253; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 3254; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 3255; AVX-NEXT: vmovaps (%rsi), %xmm1 3256; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3257; AVX-NEXT: vmovaps (%rdi), %xmm7 3258; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3259; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] 3260; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3261; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 3262; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 3263; AVX-NEXT: vbroadcastss 4(%r8), %xmm1 3264; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 3265; AVX-NEXT: vbroadcastss 4(%r9), %ymm1 3266; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] 3267; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3268; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[4],ymm8[4],ymm12[5],ymm8[5] 3269; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 3270; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] 3271; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 3272; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] 3273; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 3274; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] 3275; AVX-NEXT: vbroadcastss 16(%r9), %ymm1 3276; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 3277; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3278; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2] 3279; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 3280; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 3281; AVX-NEXT: vmovaps 32(%rsi), %xmm1 3282; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3283; AVX-NEXT: vmovaps 32(%rdi), %xmm2 3284; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill 3285; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 3286; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3287; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 3288; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 3289; AVX-NEXT: vbroadcastss 36(%r8), %xmm1 3290; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 3291; AVX-NEXT: vbroadcastss 36(%r9), %ymm1 3292; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] 3293; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3294; AVX-NEXT: vmovaps 32(%rdi), %ymm1 3295; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3296; AVX-NEXT: vmovaps 32(%rsi), %ymm0 3297; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3298; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] 3299; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 3300; AVX-NEXT: vmovaps 32(%rdx), %ymm8 3301; 
AVX-NEXT: vmovaps 32(%rcx), %ymm13 3302; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] 3303; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3304; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 3305; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] 3306; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 3307; AVX-NEXT: vmovaps 32(%r8), %ymm1 3308; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3309; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 3310; AVX-NEXT: vbroadcastss 48(%r9), %ymm1 3311; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 3312; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3313; AVX-NEXT: vmovaps 64(%rcx), %xmm1 3314; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3315; AVX-NEXT: vmovaps 64(%rdx), %xmm0 3316; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3317; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] 3318; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 3319; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 3320; AVX-NEXT: vmovaps 64(%rsi), %xmm1 3321; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3322; AVX-NEXT: vmovaps 64(%rdi), %xmm2 3323; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3324; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 3325; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3326; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 3327; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 3328; AVX-NEXT: vbroadcastss 68(%r8), %xmm1 3329; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 3330; AVX-NEXT: vbroadcastss 68(%r9), %ymm1 3331; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] 3332; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3333; AVX-NEXT: vmovaps 64(%rdi), %ymm6 3334; AVX-NEXT: vmovaps 64(%rsi), %ymm14 3335; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[4],ymm14[4],ymm6[5],ymm14[5] 3336; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 3337; AVX-NEXT: vmovaps 64(%rdx), %ymm2 3338; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3339; AVX-NEXT: vmovaps 64(%rcx), %ymm1 3340; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3341; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] 3342; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 3343; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] 3344; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 3345; AVX-NEXT: vmovaps 64(%r8), %ymm1 3346; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3347; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 3348; AVX-NEXT: vbroadcastss 80(%r9), %ymm1 3349; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 3350; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3351; AVX-NEXT: vmovaps 96(%rcx), %xmm9 3352; AVX-NEXT: vmovaps 96(%rdx), %xmm11 3353; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,2],xmm9[1,2] 3354; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3355; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3356; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 3357; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 3358; AVX-NEXT: vmovaps 96(%rsi), %xmm5 3359; AVX-NEXT: vmovaps 96(%rdi), %xmm4 3360; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = 
xmm4[2],xmm5[2],xmm4[3],xmm5[3] 3361; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3362; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 3363; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] 3364; AVX-NEXT: vbroadcastss 100(%r8), %xmm3 3365; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] 3366; AVX-NEXT: vbroadcastss 100(%r9), %ymm3 3367; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] 3368; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3369; AVX-NEXT: vmovaps 96(%rdi), %ymm7 3370; AVX-NEXT: vmovaps 96(%rsi), %ymm3 3371; AVX-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[4],ymm3[4],ymm7[5],ymm3[5] 3372; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,2,3] 3373; AVX-NEXT: vmovaps 96(%rdx), %ymm10 3374; AVX-NEXT: vmovaps 96(%rcx), %ymm2 3375; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] 3376; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3377; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 3378; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,0] 3379; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm15[2,3],ymm1[4,5,6,7] 3380; AVX-NEXT: vmovaps 96(%r8), %ymm1 3381; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3382; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 3383; AVX-NEXT: vbroadcastss 112(%r9), %ymm15 3384; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] 3385; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3386; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload 3387; AVX-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] 3388; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3389; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 3390; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3391; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm12[1,2],ymm0[5,6],ymm12[5,6] 3392; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 3393; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] 3394; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 3395; AVX-NEXT: vbroadcastss 20(%r8), %xmm15 3396; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] 3397; AVX-NEXT: vbroadcastss 20(%r9), %ymm15 3398; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] 3399; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3400; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3401; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload 3402; AVX-NEXT: # ymm15 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 3403; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm8[1,2],ymm13[1,2],ymm8[5,6],ymm13[5,6] 3404; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 3405; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] 3406; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] 3407; AVX-NEXT: vbroadcastss 52(%r8), %xmm8 3408; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] 3409; AVX-NEXT: vbroadcastss 52(%r9), %ymm8 3410; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3],ymm0[4,5,6,7] 3411; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3412; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[6],ymm14[6],ymm6[7],ymm14[7] 3413; AVX-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3414; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 3415; AVX-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload 3416; AVX-NEXT: # ymm0 = ymm14[1,2],mem[1,2],ymm14[5,6],mem[5,6] 3417; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 3418; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] 3419; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 3420; AVX-NEXT: vbroadcastss 84(%r8), %xmm6 3421; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] 3422; AVX-NEXT: vbroadcastss 84(%r9), %ymm6 3423; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] 3424; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3425; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 3426; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm9[0,0,0,0] 3427; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm11[0,0,0,0] 3428; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 3429; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 3430; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] 3431; AVX-NEXT: vinsertf128 $1, 96(%r8), %ymm0, %ymm0 3432; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 3433; AVX-NEXT: vbroadcastss 96(%r9), %ymm1 3434; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 3435; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3436; AVX-NEXT: vunpckhps {{.*#+}} ymm11 = ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[6],ymm3[6],ymm7[7],ymm3[7] 3437; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2],ymm2[1,2],ymm10[5,6],ymm2[5,6] 3438; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 3439; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] 3440; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] 3441; AVX-NEXT: vbroadcastss 116(%r8), %xmm1 3442; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 3443; AVX-NEXT: vbroadcastss 116(%r9), %ymm1 3444; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] 3445; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3446; AVX-NEXT: vbroadcastss (%rcx), %xmm0 3447; AVX-NEXT: vbroadcastss (%rdx), %xmm1 3448; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3449; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3450; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 3451; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 3452; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 3453; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] 3454; AVX-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 3455; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] 3456; AVX-NEXT: vbroadcastss (%r9), %ymm1 3457; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 3458; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3459; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 3460; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] 3461; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] 3462; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 3463; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 3464; AVX-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] 3465; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 3466; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] 3467; AVX-NEXT: vmovaps (%r9), %xmm3 
3468; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm3[0,2,2,3] 3469; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 3470; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] 3471; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload 3472; AVX-NEXT: # ymm3 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4] 3473; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] 3474; AVX-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 3475; AVX-NEXT: # ymm3 = mem[2,3],ymm3[2,3] 3476; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 3477; AVX-NEXT: # ymm4 = mem[2,3,2,3] 3478; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] 3479; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] 3480; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] 3481; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] 3482; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] 3483; AVX-NEXT: vbroadcastss 32(%rcx), %xmm4 3484; AVX-NEXT: vbroadcastss 32(%rdx), %xmm6 3485; AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] 3486; AVX-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload 3487; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload 3488; AVX-NEXT: # xmm6 = xmm5[0],mem[0],xmm5[1],mem[1] 3489; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 3490; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] 3491; AVX-NEXT: vinsertf128 $1, 32(%r8), %ymm6, %ymm6 3492; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] 3493; AVX-NEXT: vbroadcastss 32(%r9), %ymm6 3494; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] 3495; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3496; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload 3497; AVX-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] 3498; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] 3499; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3500; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 3501; AVX-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,1,3,3] 3502; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm7, %ymm7 3503; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] 3504; AVX-NEXT: vmovaps 32(%r9), %xmm7 3505; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2,2,3] 3506; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 3507; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] 3508; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload 3509; AVX-NEXT: # ymm6 = ymm13[3,0],mem[3,0],ymm13[7,4],mem[7,4] 3510; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] 3511; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm15[2,3],ymm6[2,3] 3512; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload 3513; AVX-NEXT: # ymm8 = mem[2,3,2,3] 3514; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] 3515; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5],ymm8[6,7] 3516; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] 3517; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] 3518; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5,6],ymm8[7] 3519; AVX-NEXT: vbroadcastss 64(%rcx), %xmm8 3520; AVX-NEXT: vbroadcastss 64(%rdx), %xmm9 3521; AVX-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] 3522; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 
3523; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload 3524; AVX-NEXT: # xmm9 = xmm5[0],mem[0],xmm5[1],mem[1] 3525; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm12 3526; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5,6,7] 3527; AVX-NEXT: vinsertf128 $1, 64(%r8), %ymm9, %ymm9 3528; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] 3529; AVX-NEXT: vbroadcastss 64(%r9), %ymm9 3530; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] 3531; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 3532; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload 3533; AVX-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] 3534; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] 3535; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 3536; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 3537; AVX-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3] 3538; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 3539; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5],ymm12[6,7] 3540; AVX-NEXT: vmovaps 64(%r9), %xmm12 3541; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3] 3542; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 3543; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4,5,6],ymm12[7] 3544; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3545; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm5[3,0],ymm14[3,0],ymm5[7,4],ymm14[7,4] 3546; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] 3547; AVX-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload 3548; AVX-NEXT: # ymm5 = mem[2,3],ymm12[2,3] 3549; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload 3550; AVX-NEXT: # ymm12 = mem[2,3,2,3] 3551; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] 3552; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5],ymm12[6,7] 3553; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3] 3554; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] 3555; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4,5,6],ymm12[7] 3556; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3557; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload 3558; AVX-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3] 3559; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] 3560; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 3561; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm12 3562; AVX-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3] 3563; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 3564; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] 3565; AVX-NEXT: vmovaps 96(%r9), %xmm13 3566; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3] 3567; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 3568; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] 3569; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload 3570; AVX-NEXT: # ymm10 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] 3571; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] 3572; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm10[2,3] 3573; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 3574; AVX-NEXT: # ymm10 = mem[2,3,2,3] 3575; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] 3576; AVX-NEXT: vblendps {{.*#+}} ymm2 = 
ymm10[0,1],ymm2[2,3,4,5],ymm10[6,7] 3577; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3] 3578; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] 3579; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6],ymm10[7] 3580; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 3581; AVX-NEXT: vmovaps %ymm2, 736(%rax) 3582; AVX-NEXT: vmovaps %ymm12, 640(%rax) 3583; AVX-NEXT: vmovaps %ymm5, 544(%rax) 3584; AVX-NEXT: vmovaps %ymm9, 448(%rax) 3585; AVX-NEXT: vmovaps %ymm8, 384(%rax) 3586; AVX-NEXT: vmovaps %ymm6, 352(%rax) 3587; AVX-NEXT: vmovaps %ymm7, 256(%rax) 3588; AVX-NEXT: vmovaps %ymm4, 192(%rax) 3589; AVX-NEXT: vmovaps %ymm3, 160(%rax) 3590; AVX-NEXT: vmovaps %ymm1, 64(%rax) 3591; AVX-NEXT: vmovaps %ymm0, (%rax) 3592; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3593; AVX-NEXT: vmovaps %ymm0, 704(%rax) 3594; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3595; AVX-NEXT: vmovaps %ymm0, 576(%rax) 3596; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3597; AVX-NEXT: vmovaps %ymm0, 512(%rax) 3598; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3599; AVX-NEXT: vmovaps %ymm0, 320(%rax) 3600; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3601; AVX-NEXT: vmovaps %ymm0, 128(%rax) 3602; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3603; AVX-NEXT: vmovaps %ymm0, 672(%rax) 3604; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3605; AVX-NEXT: vmovaps %ymm0, 608(%rax) 3606; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3607; AVX-NEXT: vmovaps %ymm0, 480(%rax) 3608; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3609; AVX-NEXT: vmovaps %ymm0, 416(%rax) 3610; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3611; AVX-NEXT: vmovaps %ymm0, 288(%rax) 3612; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3613; AVX-NEXT: vmovaps %ymm0, 224(%rax) 3614; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3615; AVX-NEXT: vmovaps %ymm0, 96(%rax) 3616; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3617; AVX-NEXT: vmovaps %ymm0, 32(%rax) 3618; AVX-NEXT: addq $1032, %rsp # imm = 0x408 3619; AVX-NEXT: vzeroupper 3620; AVX-NEXT: retq 3621; 3622; AVX2-LABEL: store_i32_stride6_vf32: 3623; AVX2: # %bb.0: 3624; AVX2-NEXT: subq $904, %rsp # imm = 0x388 3625; AVX2-NEXT: vmovdqa (%rsi), %xmm0 3626; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2 3627; AVX2-NEXT: vmovdqa (%rdi), %xmm1 3628; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3 3629; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3630; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3631; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3632; AVX2-NEXT: vmovdqa (%rcx), %xmm5 3633; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3634; AVX2-NEXT: vmovdqa 32(%rcx), %xmm8 3635; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3636; AVX2-NEXT: vmovdqa 64(%rcx), %xmm7 3637; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3638; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] 3639; AVX2-NEXT: vmovdqa (%rdx), %xmm6 3640; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3641; AVX2-NEXT: vmovdqa 32(%rdx), %xmm9 3642; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3643; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] 3644; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 3645; AVX2-NEXT: 
vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] 3646; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] 3647; AVX2-NEXT: vmovdqa (%r8), %xmm11 3648; AVX2-NEXT: vmovdqa 32(%r8), %xmm12 3649; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm11[0],zero,xmm11[1],zero 3650; AVX2-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3651; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] 3652; AVX2-NEXT: vpbroadcastd 4(%r9), %ymm5 3653; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] 3654; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3655; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] 3656; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3657; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] 3658; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3] 3659; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 3660; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm5 3661; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 3662; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] 3663; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm12[0],zero,xmm12[1],zero 3664; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3665; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] 3666; AVX2-NEXT: vpbroadcastd 36(%r9), %ymm5 3667; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] 3668; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3669; AVX2-NEXT: vmovdqa 64(%rdx), %xmm5 3670; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3671; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] 3672; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] 3673; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 3674; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,2,1] 3675; AVX2-NEXT: vmovdqa 64(%rsi), %xmm4 3676; AVX2-NEXT: vmovdqa 64(%rdi), %xmm5 3677; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] 3678; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3679; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 3680; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] 3681; AVX2-NEXT: vmovdqa 64(%r8), %xmm15 3682; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm15[0],zero,xmm15[1],zero 3683; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3684; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] 3685; AVX2-NEXT: vpbroadcastd 68(%r9), %ymm7 3686; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] 3687; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3688; AVX2-NEXT: vmovdqa 96(%rcx), %xmm6 3689; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3690; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] 3691; AVX2-NEXT: vmovdqa 96(%rdx), %xmm7 3692; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3693; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,3] 3694; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] 3695; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1] 3696; AVX2-NEXT: vmovdqa 96(%rsi), %xmm14 3697; AVX2-NEXT: vmovdqa 96(%rdi), %xmm7 3698; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] 3699; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3700; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 3701; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] 3702; 
AVX2-NEXT: vmovdqa 96(%r8), %xmm6 3703; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3704; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero 3705; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] 3706; AVX2-NEXT: vpbroadcastd 100(%r9), %ymm9 3707; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] 3708; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3709; AVX2-NEXT: vpbroadcastd (%rcx), %xmm8 3710; AVX2-NEXT: vpbroadcastd (%rdx), %xmm9 3711; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] 3712; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 3713; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 3714; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] 3715; AVX2-NEXT: vpbroadcastq %xmm11, %ymm1 3716; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 3717; AVX2-NEXT: vmovdqa (%r9), %xmm1 3718; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3719; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 3720; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 3721; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3722; AVX2-NEXT: vmovdqa (%rdx), %ymm0 3723; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3724; AVX2-NEXT: vmovdqa (%rcx), %ymm6 3725; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,1,2,2,4,5,6,6] 3726; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7] 3727; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] 3728; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,1,2,3] 3729; AVX2-NEXT: vmovdqa (%rdi), %ymm9 3730; AVX2-NEXT: vmovdqa (%rsi), %ymm8 3731; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] 3732; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3733; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] 3734; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero 3735; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] 3736; AVX2-NEXT: vpbroadcastd 20(%r9), %ymm11 3737; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] 3738; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3739; AVX2-NEXT: vpbroadcastd 32(%rcx), %xmm10 3740; AVX2-NEXT: vpbroadcastd 32(%rdx), %xmm11 3741; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] 3742; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 3743; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] 3744; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] 3745; AVX2-NEXT: vpbroadcastq %xmm12, %ymm3 3746; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] 3747; AVX2-NEXT: vmovdqa 32(%r9), %xmm0 3748; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3749; AVX2-NEXT: vpbroadcastd %xmm0, %ymm3 3750; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] 3751; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3752; AVX2-NEXT: vmovdqa 32(%rdx), %ymm3 3753; AVX2-NEXT: vmovdqa 32(%rcx), %ymm2 3754; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,2,4,5,6,6] 3755; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[1,1,2,3,5,5,6,7] 3756; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] 3757; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,1,2,3] 
3758; AVX2-NEXT: vmovdqa 32(%rdi), %ymm11 3759; AVX2-NEXT: vmovdqa 32(%rsi), %ymm10 3760; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] 3761; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3762; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] 3763; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero 3764; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] 3765; AVX2-NEXT: vpbroadcastd 52(%r9), %ymm13 3766; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] 3767; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3768; AVX2-NEXT: vpbroadcastd 64(%rcx), %xmm12 3769; AVX2-NEXT: vpbroadcastd 64(%rdx), %xmm13 3770; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] 3771; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 3772; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 3773; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7] 3774; AVX2-NEXT: vpbroadcastq %xmm15, %ymm5 3775; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] 3776; AVX2-NEXT: vmovdqa 64(%r9), %xmm0 3777; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 3778; AVX2-NEXT: vpbroadcastd %xmm0, %ymm5 3779; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] 3780; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3781; AVX2-NEXT: vmovdqa 64(%rdx), %ymm5 3782; AVX2-NEXT: vmovdqa 64(%rcx), %ymm4 3783; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] 3784; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,1,2,3,5,5,6,7] 3785; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] 3786; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,2,3] 3787; AVX2-NEXT: vmovdqa 64(%rdi), %ymm13 3788; AVX2-NEXT: vmovdqa 64(%rsi), %ymm12 3789; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] 3790; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3791; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] 3792; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero 3793; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] 3794; AVX2-NEXT: vpbroadcastd 84(%r9), %ymm15 3795; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] 3796; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3797; AVX2-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3798; AVX2-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 3799; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] 3800; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] 3801; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] 3802; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] 3803; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload 3804; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] 3805; AVX2-NEXT: vmovdqa 96(%r9), %xmm7 3806; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3807; AVX2-NEXT: vpbroadcastd %xmm7, %ymm7 3808; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] 3809; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3810; AVX2-NEXT: vmovdqa 96(%rdx), %ymm0 3811; AVX2-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3812; AVX2-NEXT: vmovdqa 96(%rcx), %ymm7 3813; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[0,1,2,2,4,5,6,6] 3814; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] 3815; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] 3816; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 3817; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 3818; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3819; AVX2-NEXT: vmovdqa 96(%rsi), %ymm0 3820; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3821; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] 3822; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3823; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] 3824; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero 3825; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] 3826; AVX2-NEXT: vpbroadcastd 116(%r9), %ymm15 3827; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] 3828; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3829; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3830; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 3831; AVX2-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] 3832; AVX2-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] 3833; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 3834; AVX2-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 3835; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 3836; AVX2-NEXT: # xmm15 = mem[2,2,3,3] 3837; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] 3838; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] 3839; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 3840; AVX2-NEXT: # xmm15 = mem[2,2,3,3] 3841; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] 3842; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] 3843; AVX2-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] 3844; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3845; AVX2-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] 3846; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] 3847; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] 3848; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] 3849; AVX2-NEXT: vmovdqa (%r8), %ymm9 3850; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] 3851; AVX2-NEXT: vpbroadcastd 16(%r9), %ymm14 3852; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] 3853; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] 3854; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] 3855; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 3856; AVX2-NEXT: # ymm1 = mem[2,3],ymm1[2,3] 3857; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,1,3,3,6,5,7,7] 3858; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] 3859; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] 3860; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] 3861; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] 3862; AVX2-NEXT: vpblendd {{.*#+}} ymm6 
= ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] 3863; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3864; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 3865; AVX2-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] 3866; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] 3867; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 3868; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 3869; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 3870; AVX2-NEXT: # xmm9 = mem[2,2,3,3] 3871; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] 3872; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5],ymm9[6,7] 3873; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 3874; AVX2-NEXT: # xmm9 = mem[2,2,3,3] 3875; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] 3876; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6],ymm9[7] 3877; AVX2-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] 3878; AVX2-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 3879; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] 3880; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] 3881; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] 3882; AVX2-NEXT: vmovdqa 32(%r8), %ymm10 3883; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] 3884; AVX2-NEXT: vpbroadcastd 48(%r9), %ymm11 3885; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] 3886; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] 3887; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] 3888; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 3889; AVX2-NEXT: # ymm2 = mem[2,3],ymm2[2,3] 3890; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,1,3,3,6,5,7,7] 3891; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 3892; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] 3893; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7] 3894; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 3895; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] 3896; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3897; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 3898; AVX2-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] 3899; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] 3900; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 3901; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 3902; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 3903; AVX2-NEXT: # xmm10 = mem[2,2,3,3] 3904; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] 3905; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] 3906; AVX2-NEXT: vpermilps $250, (%rsp), %xmm10 # 16-byte Folded Reload 3907; AVX2-NEXT: # xmm10 = mem[2,2,3,3] 3908; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] 3909; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] 3910; AVX2-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] 3911; AVX2-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] 3912; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] 3913; AVX2-NEXT: vpermq 
{{.*#+}} ymm10 = ymm10[2,1,2,3] 3914; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] 3915; AVX2-NEXT: vmovdqa 64(%r8), %ymm11 3916; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] 3917; AVX2-NEXT: vpbroadcastd 80(%r9), %ymm12 3918; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] 3919; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] 3920; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] 3921; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 3922; AVX2-NEXT: # ymm4 = mem[2,3],ymm4[2,3] 3923; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[2,1,3,3,6,5,7,7] 3924; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] 3925; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] 3926; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = mem[0,2,2,3,4,6,6,7] 3927; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] 3928; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6],ymm5[7] 3929; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 3930; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload 3931; AVX2-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] 3932; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] 3933; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 3934; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 3935; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 3936; AVX2-NEXT: # xmm11 = mem[2,2,3,3] 3937; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] 3938; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3,4,5],ymm11[6,7] 3939; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 3940; AVX2-NEXT: # xmm11 = mem[2,2,3,3] 3941; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] 3942; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5,6],ymm11[7] 3943; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3944; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload 3945; AVX2-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 3946; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3947; AVX2-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] 3948; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] 3949; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] 3950; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] 3951; AVX2-NEXT: vmovdqa 96(%r8), %ymm12 3952; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] 3953; AVX2-NEXT: vpbroadcastd 112(%r9), %ymm13 3954; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] 3955; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] 3956; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 3957; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3958; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] 3959; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,1,3,3,6,5,7,7] 3960; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] 3961; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5],ymm7[6,7] 3962; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = mem[0,2,2,3,4,6,6,7] 3963; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] 3964; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm7[1],ymm0[2,3,4,5,6],ymm7[7] 3965; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 3966; AVX2-NEXT: vmovdqa %ymm0, 736(%rax) 3967; AVX2-NEXT: vmovdqa %ymm11, 672(%rax) 3968; AVX2-NEXT: vmovaps %ymm5, 640(%rax) 3969; AVX2-NEXT: vmovdqa %ymm4, 544(%rax) 3970; AVX2-NEXT: vmovdqa %ymm10, 480(%rax) 3971; AVX2-NEXT: vmovaps %ymm3, 448(%rax) 3972; AVX2-NEXT: vmovdqa %ymm2, 352(%rax) 3973; AVX2-NEXT: vmovdqa %ymm9, 288(%rax) 3974; AVX2-NEXT: vmovaps %ymm1, 256(%rax) 3975; AVX2-NEXT: vmovdqa %ymm6, 160(%rax) 3976; AVX2-NEXT: vmovdqa %ymm8, 96(%rax) 3977; AVX2-NEXT: vmovaps %ymm15, 64(%rax) 3978; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3979; AVX2-NEXT: vmovaps %ymm0, 704(%rax) 3980; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3981; AVX2-NEXT: vmovaps %ymm0, 576(%rax) 3982; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3983; AVX2-NEXT: vmovaps %ymm0, 512(%rax) 3984; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3985; AVX2-NEXT: vmovaps %ymm0, 384(%rax) 3986; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3987; AVX2-NEXT: vmovaps %ymm0, 320(%rax) 3988; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3989; AVX2-NEXT: vmovaps %ymm0, 192(%rax) 3990; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3991; AVX2-NEXT: vmovaps %ymm0, 128(%rax) 3992; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3993; AVX2-NEXT: vmovaps %ymm0, (%rax) 3994; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3995; AVX2-NEXT: vmovaps %ymm0, 608(%rax) 3996; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3997; AVX2-NEXT: vmovaps %ymm0, 416(%rax) 3998; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3999; AVX2-NEXT: vmovaps %ymm0, 224(%rax) 4000; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4001; AVX2-NEXT: vmovaps %ymm0, 32(%rax) 4002; AVX2-NEXT: addq $904, %rsp # imm = 0x388 4003; AVX2-NEXT: vzeroupper 4004; AVX2-NEXT: retq 4005; 4006; AVX2-FP-LABEL: store_i32_stride6_vf32: 4007; AVX2-FP: # %bb.0: 4008; AVX2-FP-NEXT: subq $904, %rsp # imm = 0x388 4009; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm0 4010; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm2 4011; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 4012; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3 4013; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4014; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4015; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 4016; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm5 4017; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4018; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm8 4019; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4020; AVX2-FP-NEXT: vmovdqa 64(%rcx), %xmm7 4021; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4022; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] 4023; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm6 4024; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4025; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm9 4026; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4027; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] 4028; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 4029; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] 4030; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] 4031; AVX2-FP-NEXT: vmovdqa (%r8), %xmm11 
4032; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm12 4033; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm11[0],zero,xmm11[1],zero 4034; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4035; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] 4036; AVX2-FP-NEXT: vpbroadcastd 4(%r9), %ymm5 4037; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] 4038; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4039; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] 4040; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4041; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] 4042; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3] 4043; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 4044; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm5 4045; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 4046; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] 4047; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm12[0],zero,xmm12[1],zero 4048; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4049; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] 4050; AVX2-FP-NEXT: vpbroadcastd 36(%r9), %ymm5 4051; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] 4052; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4053; AVX2-FP-NEXT: vmovdqa 64(%rdx), %xmm5 4054; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4055; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] 4056; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] 4057; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 4058; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,2,1] 4059; AVX2-FP-NEXT: vmovdqa 64(%rsi), %xmm4 4060; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm5 4061; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] 4062; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4063; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 4064; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] 4065; AVX2-FP-NEXT: vmovdqa 64(%r8), %xmm15 4066; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm15[0],zero,xmm15[1],zero 4067; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4068; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] 4069; AVX2-FP-NEXT: vpbroadcastd 68(%r9), %ymm7 4070; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] 4071; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4072; AVX2-FP-NEXT: vmovdqa 96(%rcx), %xmm6 4073; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4074; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] 4075; AVX2-FP-NEXT: vmovdqa 96(%rdx), %xmm7 4076; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4077; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,3] 4078; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] 4079; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1] 4080; AVX2-FP-NEXT: vmovdqa 96(%rsi), %xmm14 4081; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm7 4082; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] 4083; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4084; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 4085; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] 
4086; AVX2-FP-NEXT: vmovdqa 96(%r8), %xmm6 4087; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4088; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero 4089; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] 4090; AVX2-FP-NEXT: vpbroadcastd 100(%r9), %ymm9 4091; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] 4092; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4093; AVX2-FP-NEXT: vpbroadcastd (%rcx), %xmm8 4094; AVX2-FP-NEXT: vpbroadcastd (%rdx), %xmm9 4095; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] 4096; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4097; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 4098; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] 4099; AVX2-FP-NEXT: vpbroadcastq %xmm11, %ymm1 4100; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 4101; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1 4102; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4103; AVX2-FP-NEXT: vpbroadcastd %xmm1, %ymm1 4104; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 4105; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4106; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm0 4107; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4108; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm6 4109; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,1,2,2,4,5,6,6] 4110; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7] 4111; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] 4112; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,1,2,3] 4113; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm9 4114; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm8 4115; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] 4116; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4117; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] 4118; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero 4119; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] 4120; AVX2-FP-NEXT: vpbroadcastd 20(%r9), %ymm11 4121; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] 4122; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4123; AVX2-FP-NEXT: vpbroadcastd 32(%rcx), %xmm10 4124; AVX2-FP-NEXT: vpbroadcastd 32(%rdx), %xmm11 4125; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] 4126; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 4127; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] 4128; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] 4129; AVX2-FP-NEXT: vpbroadcastq %xmm12, %ymm3 4130; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] 4131; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm0 4132; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4133; AVX2-FP-NEXT: vpbroadcastd %xmm0, %ymm3 4134; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] 4135; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4136; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm3 4137; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm2 4138; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,2,4,5,6,6] 4139; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[1,1,2,3,5,5,6,7] 4140; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] 4141; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,1,2,3] 4142; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm11 4143; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm10 4144; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] 4145; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4146; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] 4147; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero 4148; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] 4149; AVX2-FP-NEXT: vpbroadcastd 52(%r9), %ymm13 4150; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] 4151; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4152; AVX2-FP-NEXT: vpbroadcastd 64(%rcx), %xmm12 4153; AVX2-FP-NEXT: vpbroadcastd 64(%rdx), %xmm13 4154; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] 4155; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 4156; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 4157; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7] 4158; AVX2-FP-NEXT: vpbroadcastq %xmm15, %ymm5 4159; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] 4160; AVX2-FP-NEXT: vmovdqa 64(%r9), %xmm0 4161; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 4162; AVX2-FP-NEXT: vpbroadcastd %xmm0, %ymm5 4163; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] 4164; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4165; AVX2-FP-NEXT: vmovdqa 64(%rdx), %ymm5 4166; AVX2-FP-NEXT: vmovdqa 64(%rcx), %ymm4 4167; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] 4168; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,1,2,3,5,5,6,7] 4169; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] 4170; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,2,3] 4171; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm13 4172; AVX2-FP-NEXT: vmovdqa 64(%rsi), %ymm12 4173; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] 4174; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4175; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] 4176; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero 4177; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] 4178; AVX2-FP-NEXT: vpbroadcastd 84(%r9), %ymm15 4179; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] 4180; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4181; AVX2-FP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4182; AVX2-FP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 4183; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] 4184; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] 4185; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] 4186; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] 4187; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload 4188; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] 4189; AVX2-FP-NEXT: vmovdqa 96(%r9), %xmm7 4190; 
AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4191; AVX2-FP-NEXT: vpbroadcastd %xmm7, %ymm7 4192; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] 4193; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4194; AVX2-FP-NEXT: vmovdqa 96(%rdx), %ymm0 4195; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4196; AVX2-FP-NEXT: vmovdqa 96(%rcx), %ymm7 4197; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[0,1,2,2,4,5,6,6] 4198; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] 4199; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] 4200; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 4201; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 4202; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4203; AVX2-FP-NEXT: vmovdqa 96(%rsi), %ymm0 4204; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4205; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] 4206; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4207; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] 4208; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero 4209; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] 4210; AVX2-FP-NEXT: vpbroadcastd 116(%r9), %ymm15 4211; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] 4212; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4213; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 4214; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 4215; AVX2-FP-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] 4216; AVX2-FP-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] 4217; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 4218; AVX2-FP-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 4219; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 4220; AVX2-FP-NEXT: # xmm15 = mem[2,2,3,3] 4221; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] 4222; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] 4223; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 4224; AVX2-FP-NEXT: # xmm15 = mem[2,2,3,3] 4225; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] 4226; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] 4227; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] 4228; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4229; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] 4230; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] 4231; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] 4232; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] 4233; AVX2-FP-NEXT: vmovdqa (%r8), %ymm9 4234; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] 4235; AVX2-FP-NEXT: vpbroadcastd 16(%r9), %ymm14 4236; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] 4237; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] 4238; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] 4239; 
AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 4240; AVX2-FP-NEXT: # ymm1 = mem[2,3],ymm1[2,3] 4241; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,1,3,3,6,5,7,7] 4242; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] 4243; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] 4244; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] 4245; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] 4246; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] 4247; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4248; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 4249; AVX2-FP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] 4250; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] 4251; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 4252; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 4253; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 4254; AVX2-FP-NEXT: # xmm9 = mem[2,2,3,3] 4255; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] 4256; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5],ymm9[6,7] 4257; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 4258; AVX2-FP-NEXT: # xmm9 = mem[2,2,3,3] 4259; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] 4260; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6],ymm9[7] 4261; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] 4262; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 4263; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] 4264; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] 4265; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] 4266; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm10 4267; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] 4268; AVX2-FP-NEXT: vpbroadcastd 48(%r9), %ymm11 4269; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] 4270; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] 4271; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] 4272; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 4273; AVX2-FP-NEXT: # ymm2 = mem[2,3],ymm2[2,3] 4274; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,1,3,3,6,5,7,7] 4275; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 4276; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] 4277; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7] 4278; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 4279; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] 4280; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4281; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 4282; AVX2-FP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] 4283; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] 4284; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 4285; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 4286; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 4287; AVX2-FP-NEXT: # xmm10 = mem[2,2,3,3] 4288; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] 
4289; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] 4290; AVX2-FP-NEXT: vpermilps $250, (%rsp), %xmm10 # 16-byte Folded Reload 4291; AVX2-FP-NEXT: # xmm10 = mem[2,2,3,3] 4292; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] 4293; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] 4294; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] 4295; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] 4296; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] 4297; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] 4298; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] 4299; AVX2-FP-NEXT: vmovdqa 64(%r8), %ymm11 4300; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] 4301; AVX2-FP-NEXT: vpbroadcastd 80(%r9), %ymm12 4302; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] 4303; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] 4304; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] 4305; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 4306; AVX2-FP-NEXT: # ymm4 = mem[2,3],ymm4[2,3] 4307; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[2,1,3,3,6,5,7,7] 4308; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] 4309; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] 4310; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = mem[0,2,2,3,4,6,6,7] 4311; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] 4312; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6],ymm5[7] 4313; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4314; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload 4315; AVX2-FP-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] 4316; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] 4317; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 4318; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 4319; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 4320; AVX2-FP-NEXT: # xmm11 = mem[2,2,3,3] 4321; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] 4322; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3,4,5],ymm11[6,7] 4323; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 4324; AVX2-FP-NEXT: # xmm11 = mem[2,2,3,3] 4325; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] 4326; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5,6],ymm11[7] 4327; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4328; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload 4329; AVX2-FP-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 4330; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4331; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] 4332; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] 4333; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] 4334; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] 4335; AVX2-FP-NEXT: vmovdqa 96(%r8), %ymm12 4336; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] 4337; AVX2-FP-NEXT: 
vpbroadcastd 112(%r9), %ymm13 4338; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] 4339; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] 4340; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 4341; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4342; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] 4343; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,1,3,3,6,5,7,7] 4344; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] 4345; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5],ymm7[6,7] 4346; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = mem[0,2,2,3,4,6,6,7] 4347; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] 4348; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6],ymm7[7] 4349; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4350; AVX2-FP-NEXT: vmovdqa %ymm0, 736(%rax) 4351; AVX2-FP-NEXT: vmovdqa %ymm11, 672(%rax) 4352; AVX2-FP-NEXT: vmovaps %ymm5, 640(%rax) 4353; AVX2-FP-NEXT: vmovdqa %ymm4, 544(%rax) 4354; AVX2-FP-NEXT: vmovdqa %ymm10, 480(%rax) 4355; AVX2-FP-NEXT: vmovaps %ymm3, 448(%rax) 4356; AVX2-FP-NEXT: vmovdqa %ymm2, 352(%rax) 4357; AVX2-FP-NEXT: vmovdqa %ymm9, 288(%rax) 4358; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rax) 4359; AVX2-FP-NEXT: vmovdqa %ymm6, 160(%rax) 4360; AVX2-FP-NEXT: vmovdqa %ymm8, 96(%rax) 4361; AVX2-FP-NEXT: vmovaps %ymm15, 64(%rax) 4362; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4363; AVX2-FP-NEXT: vmovaps %ymm0, 704(%rax) 4364; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4365; AVX2-FP-NEXT: vmovaps %ymm0, 576(%rax) 4366; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4367; AVX2-FP-NEXT: vmovaps %ymm0, 512(%rax) 4368; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4369; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rax) 4370; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4371; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rax) 4372; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4373; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rax) 4374; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4375; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax) 4376; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4377; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) 4378; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4379; AVX2-FP-NEXT: vmovaps %ymm0, 608(%rax) 4380; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4381; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rax) 4382; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4383; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) 4384; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4385; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) 4386; AVX2-FP-NEXT: addq $904, %rsp # imm = 0x388 4387; AVX2-FP-NEXT: vzeroupper 4388; AVX2-FP-NEXT: retq 4389; 4390; AVX2-FCP-LABEL: store_i32_stride6_vf32: 4391; AVX2-FCP: # %bb.0: 4392; AVX2-FCP-NEXT: subq $872, %rsp # imm = 0x368 4393; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm2 4394; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm4 4395; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7 4396; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 4397; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] 4398; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4399; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 4400; AVX2-FCP-NEXT: vmovdqa 
(%rcx), %xmm3 4401; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4402; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm9 4403; AVX2-FCP-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill 4404; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %xmm6 4405; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4406; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] 4407; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm5 4408; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4409; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm10 4410; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4411; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] 4412; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] 4413; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] 4414; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] 4415; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm8 4416; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm13 4417; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero 4418; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] 4419; AVX2-FCP-NEXT: vpbroadcastd 4(%r9), %ymm3 4420; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] 4421; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4422; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] 4423; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4424; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,2,2,3] 4425; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,2,2,3] 4426; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 4427; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 4428; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 4429; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] 4430; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm13[0],zero,xmm13[1],zero 4431; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] 4432; AVX2-FCP-NEXT: vpbroadcastd 36(%r9), %ymm3 4433; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] 4434; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4435; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 4436; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4437; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,2,2,3] 4438; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] 4439; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 4440; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 4441; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 4442; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm9 4443; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] 4444; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4445; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 4446; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] 4447; AVX2-FCP-NEXT: vmovdqa 64(%r8), %xmm14 4448; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm14[0],zero,xmm14[1],zero 4449; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] 4450; AVX2-FCP-NEXT: vpbroadcastd 68(%r9), %ymm3 4451; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] 4452; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4453; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm15 4454; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,2,2,3] 4455; AVX2-FCP-NEXT: 
vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4456; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %xmm3 4457; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4458; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] 4459; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 4460; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,2,1] 4461; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm3 4462; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm0 4463; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] 4464; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4465; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm11 4466; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] 4467; AVX2-FCP-NEXT: vmovdqa 96(%r8), %xmm11 4468; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero 4469; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] 4470; AVX2-FCP-NEXT: vpbroadcastd 100(%r9), %ymm12 4471; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] 4472; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4473; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm10 4474; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm12 4475; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] 4476; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] 4477; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] 4478; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] 4479; AVX2-FCP-NEXT: vpbroadcastq %xmm8, %ymm7 4480; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5],ymm2[6,7] 4481; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm7 4482; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] 4483; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4484; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm7 4485; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm5 4486; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,2,2,4,5,6,6] 4487; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[1,1,2,3,5,5,6,7] 4488; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] 4489; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] 4490; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 4491; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4492; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm10 4493; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4494; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[6],ymm10[6],ymm2[7],ymm10[7] 4495; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4496; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5],ymm8[6,7] 4497; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero 4498; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] 4499; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm12 4500; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm12[3],ymm8[4,5,6,7] 4501; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4502; AVX2-FCP-NEXT: vpbroadcastd 32(%rcx), %xmm8 4503; AVX2-FCP-NEXT: vpbroadcastd 32(%rdx), %xmm12 4504; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] 4505; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 4506; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] 4507; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1],ymm8[2,3],ymm1[4,5,6,7] 4508; AVX2-FCP-NEXT: vpbroadcastq %xmm13, %ymm4 4509; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] 4510; AVX2-FCP-NEXT: vpbroadcastd 32(%r9), %ymm4 4511; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] 4512; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4513; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm10 4514; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm8 4515; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,2,2,4,5,6,6] 4516; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7] 4517; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] 4518; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 4519; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 4520; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4521; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm4 4522; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4523; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] 4524; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4525; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] 4526; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero 4527; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] 4528; AVX2-FCP-NEXT: vpbroadcastd 52(%r9), %ymm4 4529; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7] 4530; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4531; AVX2-FCP-NEXT: vpbroadcastd 64(%rcx), %xmm1 4532; AVX2-FCP-NEXT: vpbroadcastd 64(%rdx), %xmm4 4533; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 4534; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] 4535; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 4536; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] 4537; AVX2-FCP-NEXT: vpbroadcastq %xmm14, %ymm4 4538; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] 4539; AVX2-FCP-NEXT: vpbroadcastd 64(%r9), %ymm4 4540; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] 4541; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4542; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %ymm6 4543; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %ymm4 4544; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,2,4,5,6,6] 4545; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[1,1,2,3,5,5,6,7] 4546; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2],ymm1[3],ymm9[4],ymm1[5],ymm9[6],ymm1[7] 4547; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 4548; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 4549; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4550; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm12 4551; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] 4552; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4553; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] 4554; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm9 = mem[0],zero,mem[1],zero 4555; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] 4556; AVX2-FCP-NEXT: vpbroadcastd 84(%r9), %ymm9 4557; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6,7] 4558; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
4559; AVX2-FCP-NEXT: vpbroadcastd %xmm15, %xmm1 4560; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 4561; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] 4562; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 4563; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 4564; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 4565; AVX2-FCP-NEXT: vpbroadcastq %xmm11, %ymm1 4566; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 4567; AVX2-FCP-NEXT: vpbroadcastd 96(%r9), %ymm1 4568; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 4569; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4570; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm3 4571; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm2 4572; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] 4573; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[1,1,2,3,5,5,6,7] 4574; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2],ymm0[3],ymm9[4],ymm0[5],ymm9[6],ymm0[7] 4575; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 4576; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 4577; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4578; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm9 4579; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] 4580; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4581; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 4582; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero 4583; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7] 4584; AVX2-FCP-NEXT: vpbroadcastd 116(%r9), %ymm13 4585; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3],ymm0[4,5,6,7] 4586; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4587; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4588; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4589; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 4590; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4591; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 4592; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm13 4593; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0 4594; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] 4595; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] 4596; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm14 4597; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] 4598; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm14 4599; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm11 4600; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4,5,6],ymm11[7] 4601; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4602; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 4603; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 4604; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] 4605; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] 4606; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] 4607; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] 4608; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] 4609; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] 4610; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm13 4611; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] 4612; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4613; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] 4614; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] 4615; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 4616; AVX2-FCP-NEXT: # ymm5 = mem[2,3],ymm5[2,3] 4617; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] 4618; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm0 4619; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7] 4620; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [4,6,2,3,4,6,6,7] 4621; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm5 4622; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] 4623; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4624; AVX2-FCP-NEXT: vpunpckhdq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload 4625; AVX2-FCP-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] 4626; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] 4627; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 4628; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm11, %ymm5 4629; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm11 4630; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm13 4631; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3,4,5],ymm13[6,7] 4632; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm14 4633; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm13 4634; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2,3,4,5,6],ymm13[7] 4635; AVX2-FCP-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill 4636; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 4637; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 4638; AVX2-FCP-NEXT: # ymm5 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5] 4639; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] 4640; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] 4641; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] 4642; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] 4643; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5],ymm5[6,7] 4644; AVX2-FCP-NEXT: vpbroadcastd 48(%r9), %ymm13 4645; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7] 4646; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] 4647; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] 4648; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 4649; AVX2-FCP-NEXT: # ymm5 = mem[2,3],ymm5[2,3] 4650; AVX2-FCP-NEXT: vpermd %ymm11, %ymm7, %ymm8 4651; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7] 4652; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm8 4653; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7] 4654; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 4655; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 4656; AVX2-FCP-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] 4657; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] 4658; AVX2-FCP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 4659; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 4660; AVX2-FCP-NEXT: vmovdqa 64(%r8), %ymm10 4661; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm11 4662; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5],ymm11[6,7] 4663; AVX2-FCP-NEXT: vmovdqa 64(%r9), %ymm11 4664; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm14 4665; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4,5,6],ymm14[7] 4666; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 4667; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5] 4668; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5] 4669; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] 4670; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] 4671; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] 4672; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] 4673; AVX2-FCP-NEXT: vpbroadcastd 80(%r9), %ymm14 4674; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] 4675; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] 4676; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] 4677; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 4678; AVX2-FCP-NEXT: # ymm4 = mem[2,3],ymm4[2,3] 4679; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm6 4680; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] 4681; AVX2-FCP-NEXT: vpermd %ymm11, %ymm15, %ymm6 4682; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7] 4683; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4684; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload 4685; AVX2-FCP-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] 4686; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] 4687; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 4688; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 4689; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm10 4690; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm11 4691; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5],ymm11[6,7] 4692; AVX2-FCP-NEXT: vmovdqa 96(%r9), %ymm11 4693; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm1 4694; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4,5,6],ymm1[7] 4695; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 4696; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] 4697; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] 4698; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] 4699; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] 4700; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] 4701; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5],ymm6[6,7] 4702; AVX2-FCP-NEXT: vpbroadcastd 112(%r9), %ymm9 4703; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] 4704; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] 4705; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] 4706; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 
32-byte Folded Reload 4707; AVX2-FCP-NEXT: # ymm2 = mem[2,3],ymm2[2,3] 4708; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm3 4709; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] 4710; AVX2-FCP-NEXT: vpermd %ymm11, %ymm15, %ymm3 4711; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] 4712; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4713; AVX2-FCP-NEXT: vmovdqa %ymm2, 736(%rax) 4714; AVX2-FCP-NEXT: vmovdqa %ymm6, 672(%rax) 4715; AVX2-FCP-NEXT: vmovdqa %ymm1, 640(%rax) 4716; AVX2-FCP-NEXT: vmovdqa %ymm4, 544(%rax) 4717; AVX2-FCP-NEXT: vmovdqa %ymm12, 480(%rax) 4718; AVX2-FCP-NEXT: vmovdqa %ymm8, 448(%rax) 4719; AVX2-FCP-NEXT: vmovdqa %ymm5, 352(%rax) 4720; AVX2-FCP-NEXT: vmovdqa %ymm13, 288(%rax) 4721; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload 4722; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rax) 4723; AVX2-FCP-NEXT: vmovdqa %ymm0, 160(%rax) 4724; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4725; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax) 4726; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4727; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax) 4728; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4729; AVX2-FCP-NEXT: vmovaps %ymm0, 704(%rax) 4730; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4731; AVX2-FCP-NEXT: vmovaps %ymm0, 576(%rax) 4732; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4733; AVX2-FCP-NEXT: vmovaps %ymm0, 512(%rax) 4734; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4735; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rax) 4736; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4737; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rax) 4738; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4739; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax) 4740; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4741; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax) 4742; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4743; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) 4744; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4745; AVX2-FCP-NEXT: vmovaps %ymm0, 608(%rax) 4746; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4747; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rax) 4748; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4749; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax) 4750; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4751; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) 4752; AVX2-FCP-NEXT: addq $872, %rsp # imm = 0x368 4753; AVX2-FCP-NEXT: vzeroupper 4754; AVX2-FCP-NEXT: retq 4755; 4756; AVX512-LABEL: store_i32_stride6_vf32: 4757; AVX512: # %bb.0: 4758; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 4759; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5 4760; AVX512-NEXT: vmovdqa64 (%rdi), %zmm17 4761; AVX512-NEXT: vmovdqa64 (%rsi), %zmm18 4762; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm7 4763; AVX512-NEXT: vmovdqa64 (%rdx), %zmm1 4764; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm0 4765; AVX512-NEXT: vmovdqa64 (%rcx), %zmm6 4766; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm10 4767; AVX512-NEXT: vmovdqa64 (%r8), %zmm3 4768; AVX512-NEXT: vmovdqa64 64(%r8), %zmm8 4769; AVX512-NEXT: vmovdqa64 (%r9), %zmm4 4770; AVX512-NEXT: vmovdqa64 64(%r9), %zmm9 4771; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] 4772; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 4773; AVX512-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] 4774; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] 4775; AVX512-NEXT: vmovdqa64 %zmm5, %zmm11 4776; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm11 4777; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] 4778; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] 4779; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 4780; AVX512-NEXT: vpermt2d %zmm7, %zmm14, %zmm13 4781; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] 4782; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 4783; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15 4784; AVX512-NEXT: vpermt2d %zmm7, %zmm16, %zmm15 4785; AVX512-NEXT: vpermi2d %zmm18, %zmm17, %zmm12 4786; AVX512-NEXT: vpermi2d %zmm18, %zmm17, %zmm14 4787; AVX512-NEXT: vpermi2d %zmm18, %zmm17, %zmm16 4788; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15] 4789; AVX512-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 4790; AVX512-NEXT: vmovdqa64 (%rdx), %ymm18 4791; AVX512-NEXT: vmovdqa64 64(%rdx), %ymm20 4792; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] 4793; AVX512-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 4794; AVX512-NEXT: movb $36, %dl 4795; AVX512-NEXT: kmovw %edx, %k1 4796; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] 4797; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] 4798; AVX512-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 4799; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] 4800; AVX512-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 4801; AVX512-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 4802; AVX512-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 4803; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] 4804; AVX512-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 4805; AVX512-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 4806; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] 4807; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 4808; AVX512-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 4809; AVX512-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} 4810; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] 4811; AVX512-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 4812; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] 4813; AVX512-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 4814; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] 4815; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] 4816; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 4817; AVX512-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 4818; AVX512-NEXT: movb $-110, %cl 4819; AVX512-NEXT: kmovw %ecx, %k2 4820; AVX512-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} 4821; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] 4822; AVX512-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 4823; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] 4824; AVX512-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 4825; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] 4826; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] 4827; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 4828; AVX512-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 4829; AVX512-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} 4830; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] 4831; AVX512-NEXT: vpermt2d %zmm8, %zmm26, 
%zmm15 4832; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] 4833; AVX512-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 4834; AVX512-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 4835; AVX512-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} 4836; AVX512-NEXT: vpermt2d %zmm3, %zmm20, %zmm12 4837; AVX512-NEXT: vpermt2d %zmm4, %zmm21, %zmm12 4838; AVX512-NEXT: vpermi2d %zmm6, %zmm1, %zmm22 4839; AVX512-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2} 4840; AVX512-NEXT: vpermt2d %zmm3, %zmm23, %zmm14 4841; AVX512-NEXT: vpermt2d %zmm4, %zmm24, %zmm14 4842; AVX512-NEXT: vpermi2d %zmm6, %zmm1, %zmm25 4843; AVX512-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} 4844; AVX512-NEXT: vpermt2d %zmm3, %zmm26, %zmm16 4845; AVX512-NEXT: vpermt2d %zmm4, %zmm27, %zmm16 4846; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] 4847; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] 4848; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 4849; AVX512-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 4850; AVX512-NEXT: vmovdqa64 (%rdi), %ymm21 4851; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm22 4852; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] 4853; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] 4854; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] 4855; AVX512-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 4856; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] 4857; AVX512-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 4858; AVX512-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 4859; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] 4860; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] 4861; AVX512-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 4862; AVX512-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 4863; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] 4864; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 4865; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 4866; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] 4867; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] 4868; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] 4869; AVX512-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 4870; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] 4871; AVX512-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 4872; AVX512-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 4873; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] 4874; AVX512-NEXT: vpermt2d %zmm3, %zmm5, %zmm1 4875; AVX512-NEXT: vpermt2d %zmm4, %zmm7, %zmm1 4876; AVX512-NEXT: vmovdqa64 %zmm16, (%rax) 4877; AVX512-NEXT: vmovdqa64 %zmm14, 192(%rax) 4878; AVX512-NEXT: vmovdqa64 %zmm1, 320(%rax) 4879; AVX512-NEXT: vmovdqa64 %zmm12, 256(%rax) 4880; AVX512-NEXT: vmovdqa64 %zmm15, 384(%rax) 4881; AVX512-NEXT: vmovdqa64 %zmm13, 576(%rax) 4882; AVX512-NEXT: vmovdqa64 %zmm0, 704(%rax) 4883; AVX512-NEXT: vmovdqa64 %zmm11, 640(%rax) 4884; AVX512-NEXT: vmovdqa64 %zmm18, 128(%rax) 4885; AVX512-NEXT: vmovdqa64 %zmm2, 448(%rax) 4886; AVX512-NEXT: vmovdqa64 %zmm20, 512(%rax) 4887; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rax) 4888; AVX512-NEXT: vzeroupper 4889; AVX512-NEXT: retq 4890; 4891; AVX512-FCP-LABEL: store_i32_stride6_vf32: 4892; AVX512-FCP: # %bb.0: 4893; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 4894; 
; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm11
; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13
; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm18
; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm24
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14
; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm14
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm2
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm20
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3
; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm22
; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm8, %zmm6
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm23
; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm21, %zmm23
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm9
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm15
; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm25
; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm7
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm25
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm5
; AVX512-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm17
; AVX512-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm21
; AVX512-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm16
; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm4
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm26
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18
; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm12
; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm8
; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm10
; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm24
; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm1
; AVX512-FCP-NEXT: movb $-110, %al
; AVX512-FCP-NEXT: kmovw %eax, %k2
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2}
; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm11
; AVX512-FCP-NEXT: movb $36, %al
; AVX512-FCP-NEXT: kmovw %eax, %k1
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm13
; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm2
; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3
; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm15
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm0
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm18
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1}
; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm5
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1}
; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm8
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 {%k2}
; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm12
; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm10
; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm13
; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm2
; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm16
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm3
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm6
; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm1
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm9
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm15
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm0
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm18
; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm7, %zmm5
; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm11, %zmm10
; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm16
; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm1
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 384(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 448(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 512(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 640(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 704(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i32_stride6_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm17
; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm18
; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm7
; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm1
; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm0
; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6
; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm10
; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm3
; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm8
; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm4
; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm9
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm11
; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm11
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13
; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm14, %zmm13
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm15
; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm16, %zmm15
; AVX512DQ-NEXT: vpermi2d %zmm18, %zmm17, %zmm12
; AVX512DQ-NEXT: vpermi2d %zmm18, %zmm17, %zmm14
; AVX512DQ-NEXT: vpermi2d %zmm18, %zmm17, %zmm16
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15]
; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm2, %zmm17
; AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm18
; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %ymm20
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12]
; AVX512DQ-NEXT: vpermt2d (%rcx), %ymm21, %ymm18
; AVX512DQ-NEXT: movb $36, %dl
; AVX512DQ-NEXT: kmovw %edx, %k1
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7]
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm18, %zmm17
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm22, %zmm17
; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm5, %zmm2
; AVX512DQ-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7]
; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm18, %zmm2
; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm22, %zmm2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm18, %zmm20
; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1}
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm20, %zmm11
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm21, %zmm11
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23
; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm22, %zmm23
; AVX512DQ-NEXT: movb $-110, %cl
; AVX512DQ-NEXT: kmovw %ecx, %k2
; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2}
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm23, %zmm13
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm24, %zmm13
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26
; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm25, %zmm26
; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2}
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm26, %zmm15
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm27, %zmm15
; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm1, %zmm18
; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1}
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm20, %zmm12
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm21, %zmm12
; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm1, %zmm22
; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2}
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm23, %zmm14
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm24, %zmm14
; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm1, %zmm25
; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2}
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm26, %zmm16
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm27, %zmm16
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm18, %zmm20
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm21
; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm22
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7]
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3]
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm22, %zmm20
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm23, %zmm20
; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm1, %zmm18
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7]
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3]
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm22, %zmm18
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm23, %zmm18
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm21, %zmm0
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15]
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7]
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm5, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm7, %zmm0
; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm21, %zmm1
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7]
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm5, %zmm1
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm7, %zmm1
; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm14, 192(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm12, 256(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm15, 384(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 576(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 704(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 640(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm18, 128(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm20, 512(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i32_stride6_vf32:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm11
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm18
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm24
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14
; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm14
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm2
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm20
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3
; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm22
; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm8, %zmm6
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm23
; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm21, %zmm23
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm9
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm15
; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm25
; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm7
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm25
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm5
; AVX512DQ-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm17
; AVX512DQ-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm21
; AVX512DQ-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm16
; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm26
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18
; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm12
; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm8
; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm10
; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm24
; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm1
; AVX512DQ-FCP-NEXT: movb $-110, %al
; AVX512DQ-FCP-NEXT: kmovw %eax, %k2
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm11
; AVX512DQ-FCP-NEXT: movb $36, %al
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm13
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm15
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm0
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm18
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1}
; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm12
; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm10
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm13
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm2
; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm16
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm6
; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm1
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm9
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm15
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm0
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm18
; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm7, %zmm5
; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm11, %zmm10
; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm16
; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm1
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 384(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 448(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 512(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 640(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 704(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i32_stride6_vf32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm17
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm18
; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm7
; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1
; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm0
; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6
; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm10
; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm3
; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm8
; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm4
; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm9
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11
; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm11
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
; AVX512BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm13
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15
; AVX512BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm15
; AVX512BW-NEXT: vpermi2d %zmm18, %zmm17, %zmm12
; AVX512BW-NEXT: vpermi2d %zmm18, %zmm17, %zmm14
; AVX512BW-NEXT: vpermi2d %zmm18, %zmm17, %zmm16
; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15]
; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17
; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm18
; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm20
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12]
; AVX512BW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18
; AVX512BW-NEXT: movb $36, %dl
; AVX512BW-NEXT: kmovd %edx, %k1
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7]
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512BW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512BW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17
; AVX512BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2
; AVX512BW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7]
; AVX512BW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2
; AVX512BW-NEXT: vpermt2d %zmm9, %zmm22, %zmm2
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20
; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512BW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23
; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23
; AVX512BW-NEXT: movb $-110, %cl
; AVX512BW-NEXT: kmovd %ecx, %k2
; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512BW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26
; AVX512BW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26
; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512BW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512BW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15
; AVX512BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18
; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1}
; AVX512BW-NEXT: vpermt2d %zmm3, %zmm20, %zmm12
; AVX512BW-NEXT: vpermt2d %zmm4, %zmm21, %zmm12
; AVX512BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm22
; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2}
; AVX512BW-NEXT: vpermt2d %zmm3, %zmm23, %zmm14
; AVX512BW-NEXT: vpermt2d %zmm4, %zmm24, %zmm14
; AVX512BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm25
; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2}
; AVX512BW-NEXT: vpermt2d %zmm3, %zmm26, %zmm16
; AVX512BW-NEXT: vpermt2d %zmm4, %zmm27, %zmm16
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20
; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm21
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm22
; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7]
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512BW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512BW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20
; AVX512BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18
; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7]
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpermt2d %zmm3, %zmm22, %zmm18
; AVX512BW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0
; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15]
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7]
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512BW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0
; AVX512BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7]
; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm1
; AVX512BW-NEXT: vpermt2d %zmm4, %zmm7, %zmm1
; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm15, 384(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm13, 576(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm0, 704(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm20, 512(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i32_stride6_vf32:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm11
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm18
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm24
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14
; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm14
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm2
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm20
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3
; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22
; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm8, %zmm6
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23
; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm21, %zmm23
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm9
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15
; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25
; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm7
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm25
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm5
; AVX512BW-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm17
; AVX512BW-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm21
; AVX512BW-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm16
; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18
; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm12
; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm8
; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm10
; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm24
; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm1
; AVX512BW-FCP-NEXT: movb $-110, %al
; AVX512BW-FCP-NEXT: kmovd %eax, %k2
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2}
; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm11
; AVX512BW-FCP-NEXT: movb $36, %al
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm6
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm15
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm0
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm18
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1}
; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm5
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1}
; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm8
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 {%k2}
; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm12
; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm10
; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm13
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm2
; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm16
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm6
; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm1
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm9
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm15
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm0
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm18
; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm7, %zmm5
; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm11, %zmm10
; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm16
; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm1
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 448(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 512(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 640(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 704(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i32_stride6_vf32:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm17
; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm18
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm8
; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm9
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm11
; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm11
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm13
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm15
; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm15
; AVX512DQ-BW-NEXT: vpermi2d %zmm18, %zmm17, %zmm12
; AVX512DQ-BW-NEXT: vpermi2d %zmm18, %zmm17, %zmm14
; AVX512DQ-BW-NEXT: vpermi2d %zmm18, %zmm17, %zmm16
; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15]
; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm18
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %ymm20
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12]
; AVX512DQ-BW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18
; AVX512DQ-BW-NEXT: movb $36, %dl
; AVX512DQ-BW-NEXT: kmovd %edx, %k1
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7]
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17
; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2
; AVX512DQ-BW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7]
; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2
; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm22, %zmm2
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1}
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23
; AVX512DQ-BW-NEXT: movb $-110, %cl
; AVX512DQ-BW-NEXT: kmovd %ecx, %k2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2}
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2}
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15
; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1}
; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm20, %zmm12
; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm21, %zmm12
; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm22
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2}
; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm23, %zmm14
; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm24, %zmm14
; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm25
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2}
; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm26, %zmm16
; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm27, %zmm16
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm21
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm22
; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7]
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3]
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20
; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18
; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7]
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3]
; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm22, %zmm18
; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0
; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15]
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7]
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0
; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7]
; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm1
; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm7, %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 192(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 320(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 256(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 384(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 576(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 704(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 640(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 128(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 448(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 512(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 64(%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf32:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm24
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm14
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm2
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm20
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm8, %zmm6
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm21, %zmm23
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm9
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm25
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm1
; AVX512DQ-BW-FCP-NEXT: movb $-110, %al
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm11
; AVX512DQ-BW-FCP-NEXT: movb $36, %al
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
[0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm7, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm11, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm1
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 448(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 512(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 640(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 704(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64
  %in.vec1 = load <32 x i32>, ptr %in.vecptr1, align 64
  %in.vec2 = load <32 x i32>, ptr %in.vecptr2, align 64
  %in.vec3 = load <32 x i32>, ptr %in.vecptr3, align 64
  %in.vec4 = load <32 x i32>, ptr %in.vecptr4, align 64
  %in.vec5 = load <32 x i32>, ptr %in.vecptr5, align 64
  %1 = shufflevector <32 x i32> %in.vec0, <32 x i32> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %2 = shufflevector <32 x i32> %in.vec2, <32 x i32> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %3 = shufflevector <32 x i32> %in.vec4, <32 x i32> %in.vec5, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %4 = shufflevector <64 x i32> %1, <64 x i32> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %5 = shufflevector <64 x i32> %3, <64 x i32> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <128 x i32> %4, <128 x i32> %5, <192 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191>
  %interleaved.vec = shufflevector <192 x i32> %6, <192 x i32> poison, <192 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 160, i32 1, i32 33, i32 65, i32 97, i32 129, i32 161, i32 2, i32 34, i32 66, i32 98, i32 130, i32 162, i32 3, i32 35, i32 67, i32 99, i32 131, i32 163, i32 4, i32 36, i32 68, i32 100, i32 132, i32 164, i32 5, i32 37, i32 69, i32 101, i32 133, i32 165, i32 6, i32 38, i32 70, i32 102, i32 134, i32 166, i32 7, i32 39, i32 71, i32 103, i32 135, i32 167, i32 8, i32 40, i32 72, i32 104, i32 136, i32 168, i32 9, i32 41, i32 73, i32 105, i32 137, i32 169, i32 10, i32 42, i32 74, i32 106, i32 138, i32 170, i32 11, i32 43, i32 75, i32 107, i32 139, i32 171, i32 12, i32 44, i32 76, i32 108, i32 140, i32 172, i32 13, i32 45, i32 77, i32 109, i32 141, i32 173, i32 14, i32 46, i32 78, i32 110, i32 142, i32 174, i32 15, i32 47, i32 79, i32 111, i32 143, i32 175, i32 16, i32 48, i32 80, i32 112, i32 144, i32 176, i32 17, i32 49, i32 81, i32 113, i32 145, i32 177, i32 18, i32 50, i32 82, i32 114, i32 146, i32 178, i32 19, i32 51, i32 83, i32 115, i32 147, i32 179, i32 20, i32 52, i32 84, i32 116, i32 148, i32 180, i32 21, i32 53, i32 85, i32 117, i32 149, i32 181, i32 22, i32 54, i32 86, i32 118, i32 150, i32 182, i32 23, i32 55, i32 87, i32 119, i32 151, i32 183, i32 24, i32 56, i32 88, i32 120, i32 152, i32 184, i32 25, i32 57, i32 89, i32 121, i32 153, i32 185, i32 26, i32 58, i32 90, i32 122, i32 154, i32 186, i32 27, i32 59, i32 91, i32 123, i32 155, i32 187, i32 28, i32 60, i32 92, i32 124, i32 156, i32 188, i32 29, i32 61, i32 93, i32 125, i32 157, i32 189, i32 30, i32 62, i32 94, i32 126, i32 158, i32 190, i32 31, i32 63, i32 95, i32 127, i32 159, i32 191>
  store <192 x i32> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

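; The final shufflevector above realizes the stride-6 interleave: output
; element 6*i+j is element i of %in.vecj, i.e. one element from each of the
; six concatenated sources per group of six. vf64 below repeats the same
; pattern with 64 elements per source vector.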
define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride6_vf64:
; SSE: # %bb.0:
; SSE-NEXT: subq $1224, %rsp # imm = 0x4C8
; SSE-NEXT: movaps (%rdi), %xmm9
; SSE-NEXT: movaps 16(%rdi), %xmm10
; SSE-NEXT: movaps (%rsi), %xmm2
; SSE-NEXT: movaps 16(%rsi), %xmm0
; SSE-NEXT: movaps (%rdx), %xmm11
; SSE-NEXT: movaps 16(%rdx), %xmm12
; SSE-NEXT: movaps (%rcx), %xmm4
; SSE-NEXT: movaps 16(%rcx), %xmm1
; SSE-NEXT: movaps (%r8), %xmm6
; SSE-NEXT: movaps 16(%r8), %xmm3
; SSE-NEXT: movaps (%r9), %xmm7
; SSE-NEXT: movaps 16(%r9), %xmm5
; SSE-NEXT: movaps %xmm11, %xmm13
; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
; SSE-NEXT: movaps %xmm9, %xmm8
; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
; SSE-NEXT: movaps %xmm7, %xmm14
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm6[0]
; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm8[2,3]
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm6, %xmm8
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[0,2]
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm4, %xmm8
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3]
; SSE-NEXT: movaps %xmm6, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm9[2,3]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0]
; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm7[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm4[2],xmm11[3],xmm4[3]
; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[0,2]
; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm12, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps %xmm10, %xmm2
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm5, %xmm6
; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm3, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm5[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm12[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,3]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0]
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 32(%rdi), %xmm6
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm5[3,3]
; SSE-NEXT: movaps 32(%rdx), %xmm5
; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3]
; SSE-NEXT: movaps 32(%rcx), %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm3[0,2]
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm5, %xmm7
; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
; SSE-NEXT: movaps 32(%rsi), %xmm1
; SSE-NEXT: movaps %xmm6, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps 32(%r8), %xmm2
; SSE-NEXT: movaps 32(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[0,2]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 48(%rdx), %xmm6
; SSE-NEXT: movaps 48(%rcx), %xmm0
; SSE-NEXT: movaps %xmm6, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: movaps 48(%rdi), %xmm7
; SSE-NEXT: movaps 48(%rsi), %xmm1
; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps 48(%r8), %xmm2
; SSE-NEXT: movaps 48(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%rdx), %xmm6
; SSE-NEXT: movaps 64(%rcx), %xmm0
; SSE-NEXT: movaps %xmm6, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: movaps 64(%rdi), %xmm7
; SSE-NEXT: movaps 64(%rsi), %xmm1
; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps 64(%r8), %xmm2
; SSE-NEXT: movaps 64(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 80(%rdx), %xmm6
; SSE-NEXT: movaps 80(%rcx), %xmm0
; SSE-NEXT: movaps %xmm6, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: movaps 80(%rdi), %xmm7
; SSE-NEXT: movaps 80(%rsi), %xmm1
; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps 80(%r8), %xmm2
; SSE-NEXT: movaps 80(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 96(%rdx), %xmm6
; SSE-NEXT: movaps 96(%rcx), %xmm0
; SSE-NEXT: movaps %xmm6, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: movaps 96(%rdi), %xmm7
; SSE-NEXT: movaps 96(%rsi), %xmm1
; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps 96(%r8), %xmm2
; SSE-NEXT: movaps 96(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 112(%rdx), %xmm6
; SSE-NEXT: movaps 112(%rcx), %xmm0
; SSE-NEXT: movaps %xmm6, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: movaps 112(%rdi), %xmm7
; SSE-NEXT: movaps 112(%rsi), %xmm1
; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps 112(%r8), %xmm2
; SSE-NEXT: movaps 112(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 128(%rdx), %xmm6
; SSE-NEXT: movaps 128(%rcx), %xmm0
; SSE-NEXT: movaps %xmm6, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: movaps 128(%rdi), %xmm7
; SSE-NEXT: movaps 128(%rsi), %xmm1
; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps 128(%r8), %xmm2
; SSE-NEXT: movaps 128(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 144(%rdx), %xmm6
; SSE-NEXT: movaps 144(%rcx), %xmm0
; SSE-NEXT: movaps %xmm6, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: movaps 144(%rdi), %xmm7
; SSE-NEXT: movaps 144(%rsi), %xmm1
; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps 144(%r8), %xmm2
; SSE-NEXT: movaps 144(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 160(%rdx), %xmm6
; SSE-NEXT: movaps 160(%rcx), %xmm0
; SSE-NEXT: movaps %xmm6, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: movaps 160(%rdi), %xmm7
; SSE-NEXT: movaps 160(%rsi), %xmm1
; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps 160(%r8), %xmm2
; SSE-NEXT: movaps 160(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 176(%rdx), %xmm6
; SSE-NEXT: movaps 176(%rcx), %xmm0
; SSE-NEXT: movaps %xmm6, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: movaps 176(%rdi), %xmm7
; SSE-NEXT: movaps 176(%rsi), %xmm1
; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps 176(%r8), %xmm2
; SSE-NEXT: movaps 176(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 192(%rdx), %xmm6
; SSE-NEXT: movaps 192(%rcx), %xmm0
; SSE-NEXT: movaps %xmm6, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: movaps 192(%rdi), %xmm7
; SSE-NEXT: movaps 192(%rsi), %xmm1
; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps 192(%r8), %xmm2
; SSE-NEXT: movaps 192(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 208(%rdx), %xmm6
; SSE-NEXT: movaps 208(%rcx), %xmm0
; SSE-NEXT: movaps %xmm6, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: movaps 208(%rdi), %xmm7
; SSE-NEXT: movaps 208(%rsi), %xmm1
; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: movaps 208(%r8), %xmm2
; SSE-NEXT: movaps 208(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 224(%rdx), %xmm9
; SSE-NEXT: movaps 224(%rcx), %xmm0
; SSE-NEXT: movaps %xmm9, %xmm14
; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
; SSE-NEXT: movaps 224(%rdi), %xmm11
; SSE-NEXT: movaps 224(%rsi), %xmm1
; SSE-NEXT: movaps %xmm11, %xmm13
; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
; SSE-NEXT: movaps 224(%r8), %xmm2
; SSE-NEXT: movaps 224(%r9), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm15
; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3]
; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2]
; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1]
; SSE-NEXT: movaps %xmm2, %xmm8
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2]
; SSE-NEXT: movaps 240(%rdx), %xmm3
; SSE-NEXT: movaps 240(%rcx), %xmm12
; SSE-NEXT: movaps %xmm3, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
; SSE-NEXT: movaps 240(%rdi), %xmm2
; SSE-NEXT: movaps 240(%rsi), %xmm10
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1]
; SSE-NEXT: movaps 240(%r8), %xmm1
; SSE-NEXT: movaps 240(%r9), %xmm7
; SSE-NEXT: movaps %xmm7, %xmm6
; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3]
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2]
; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3]
; SSE-NEXT: movaps %xmm12, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; SSE-NEXT: movaps %xmm1, %xmm10
; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1]
; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3]
; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps %xmm3, 1520(%rax)
; SSE-NEXT: movaps %xmm10, 1504(%rax)
; SSE-NEXT: movaps %xmm2, 1488(%rax)
; SSE-NEXT: movaps %xmm5, 1472(%rax)
; SSE-NEXT: movaps %xmm6, 1456(%rax)
; SSE-NEXT: movaps %xmm4, 1440(%rax)
; SSE-NEXT: movaps %xmm9, 1424(%rax)
; SSE-NEXT: movaps %xmm8, 1408(%rax)
; SSE-NEXT: movaps %xmm11, 1392(%rax)
; SSE-NEXT: movaps %xmm14, 1376(%rax)
; SSE-NEXT: movaps %xmm15, 1360(%rax)
; SSE-NEXT: movaps %xmm13, 1344(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1328(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1312(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1296(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1280(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1264(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1248(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1232(%rax)
; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1216(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1200(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1184(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1168(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1152(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1136(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1120(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1104(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1088(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1072(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1056(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1040(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1024(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1008(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 992(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 976(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 960(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 944(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 928(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 912(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 896(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 880(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 864(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 848(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 832(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 816(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 800(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 784(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 768(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 752(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 736(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 720(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 704(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 688(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 672(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 656(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 640(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 624(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 608(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 592(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 576(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 560(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 544(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 528(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 512(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 496(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 480(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 464(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 448(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 432(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 416(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 400(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 384(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 368(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 352(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 336(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 320(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 304(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 288(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 272(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 256(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 240(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 224(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 208(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 192(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 176(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 160(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 144(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 128(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 112(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 96(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 64(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 48(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, (%rax)
; SSE-NEXT: addq $1224, %rsp # imm = 0x4C8
; SSE-NEXT: retq
;
; AVX-LABEL: store_i32_stride6_vf64:
; AVX: # %bb.0:
; AVX-NEXT: subq $2504, %rsp # imm = 0x9C8
; AVX-NEXT: vmovaps (%rdi), %ymm8
; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps (%rsi), %ymm9
; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps (%rdx), %ymm4
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps (%rcx), %ymm15
; AVX-NEXT: vmovaps (%r8), %ymm6
; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps (%rcx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 32(%rcx), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rdx), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 32(%rdx), %xmm3
; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vmovaps (%rsi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rdi), %xmm7
; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 4(%r8), %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vbroadcastss 4(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm4[0],ymm15[2],ymm4[2]
; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 16(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vmovaps 32(%rsi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 32(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 36(%r8), %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vbroadcastss 36(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 32(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 32(%rsi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT: vmovaps 32(%rdx), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 32(%rcx), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 32(%r8), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 48(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 64(%rcx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 64(%rdx), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vmovaps 64(%rsi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 64(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 68(%r8), %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vbroadcastss 68(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 64(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 64(%rsi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT: vmovaps 64(%rdx), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 64(%rcx), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 64(%r8), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 80(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 96(%rcx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 96(%rdx), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vmovaps 96(%rsi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 96(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 100(%r8), %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vbroadcastss 100(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 96(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 96(%rsi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT: vmovaps 96(%rdx), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 96(%rcx), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 96(%r8), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 112(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 128(%rcx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 128(%rdx), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vmovaps 128(%rsi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 128(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 132(%r8), %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vbroadcastss 132(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 128(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 128(%rsi), %ymm11
; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT: vmovaps 128(%rdx), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 128(%rcx), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 128(%r8), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 144(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 160(%rcx), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 160(%rdx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vmovaps 160(%rsi), %xmm7
; AVX-NEXT: vmovaps 160(%rdi), %xmm6
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 164(%r8), %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vbroadcastss 164(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 160(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 160(%rsi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT: vmovaps 160(%rdx), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 160(%rcx), %ymm8
; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2]
; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 160(%r8), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 176(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 192(%rcx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 192(%rdx), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vmovaps 192(%rsi), %xmm3
; AVX-NEXT: vmovaps 192(%rdi), %xmm2
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 196(%r8), %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vbroadcastss 196(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 192(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 192(%rsi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT: vmovaps 192(%rdx), %ymm4
; AVX-NEXT: vmovaps 192(%rcx), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 192(%r8), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 208(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 224(%rcx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 224(%rdx), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vmovaps 224(%rsi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 224(%rdi), %xmm5
; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vbroadcastss 228(%r8), %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vbroadcastss 228(%r9), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 224(%rdi), %ymm9
; AVX-NEXT: vmovaps 224(%rsi), %ymm0
; AVX-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3]
; AVX-NEXT: vmovaps 224(%rdx), %ymm5
; AVX-NEXT: vmovaps 224(%rcx), %ymm1
; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0]
; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7]
; AVX-NEXT: vmovaps 224(%r8), %ymm10
; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7]
; AVX-NEXT: vbroadcastss 240(%r9), %ymm14
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload
; AVX-NEXT: # ymm14 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm10[1,2],ymm15[1,2],ymm10[5,6],ymm15[5,6]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3]
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,1,3,4,6,5,7]
; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7]
; AVX-NEXT: vbroadcastss 20(%r8), %xmm15
; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7]
; AVX-NEXT: vbroadcastss 20(%r9), %ymm15
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7]
; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
; AVX-NEXT: # ymm13 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7]
; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm15 # 32-byte Folded Reload
; AVX-NEXT: # ymm15 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,1,3,4,6,5,7]
; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm13[4,5],ymm15[6,7]
; AVX-NEXT: vbroadcastss 52(%r8), %xmm12
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7]
; AVX-NEXT: vbroadcastss 52(%r9), %ymm15
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7]
; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
; AVX-NEXT: # ymm13 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7]
; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
; AVX-NEXT: # ymm12 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3]
; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,1,3,4,6,5,7]
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
; AVX-NEXT: vbroadcastss 84(%r8), %xmm15
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7]
; AVX-NEXT: vbroadcastss 84(%r9), %ymm15
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7]
; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
; AVX-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1]
; AVX-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = mem[0,0,0,0]
; AVX-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; AVX-NEXT: # xmm13 = mem[0,0,0,0]
; AVX-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15
; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7]
; AVX-NEXT: vinsertf128 $1, 96(%r8), %ymm12, %ymm12
; AVX-NEXT: vblendps
{{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] 6947; AVX-NEXT: vbroadcastss 96(%r9), %ymm13 6948; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] 6949; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6950; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 6951; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload 6952; AVX-NEXT: # ymm12 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] 6953; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 6954; AVX-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload 6955; AVX-NEXT: # ymm13 = ymm13[1,2],mem[1,2],ymm13[5,6],mem[5,6] 6956; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] 6957; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,1,3,4,6,5,7] 6958; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5],ymm13[6,7] 6959; AVX-NEXT: vbroadcastss 116(%r8), %xmm15 6960; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] 6961; AVX-NEXT: vbroadcastss 116(%r9), %ymm15 6962; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] 6963; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6964; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 6965; AVX-NEXT: vunpckhps {{.*#+}} ymm13 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] 6966; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6967; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 6968; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 6969; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,2],ymm15[1,2],ymm11[5,6],ymm15[5,6] 6970; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] 6971; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,1,3,4,6,5,7] 6972; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] 6973; AVX-NEXT: vbroadcastss 148(%r8), %xmm13 6974; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] 6975; AVX-NEXT: vbroadcastss 148(%r9), %ymm13 6976; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7] 6977; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6978; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 6979; AVX-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 6980; AVX-NEXT: # xmm7 = mem[0,0,0,0] 6981; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6982; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm13[0,0,0,0] 6983; AVX-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] 6984; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11 6985; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5,6,7] 6986; AVX-NEXT: vinsertf128 $1, 160(%r8), %ymm6, %ymm6 6987; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] 6988; AVX-NEXT: vbroadcastss 160(%r9), %ymm7 6989; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] 6990; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6991; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6992; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload 6993; AVX-NEXT: # ymm7 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] 6994; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6995; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm8[1,2],ymm6[5,6],ymm8[5,6] 
6996; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] 6997; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,5,7] 6998; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] 6999; AVX-NEXT: vbroadcastss 180(%r8), %xmm8 7000; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] 7001; AVX-NEXT: vbroadcastss 180(%r9), %ymm8 7002; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] 7003; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7004; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 7005; AVX-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7006; AVX-NEXT: # xmm3 = mem[0,0,0,0] 7007; AVX-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 7008; AVX-NEXT: # xmm6 = mem[0,0,0,0] 7009; AVX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] 7010; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 7011; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] 7012; AVX-NEXT: vinsertf128 $1, 192(%r8), %ymm2, %ymm2 7013; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] 7014; AVX-NEXT: vbroadcastss 192(%r9), %ymm3 7015; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] 7016; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7017; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7018; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload 7019; AVX-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] 7020; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7021; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,2],ymm2[1,2],ymm4[5,6],ymm2[5,6] 7022; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] 7023; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,1,3,4,6,5,7] 7024; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] 7025; AVX-NEXT: vbroadcastss 212(%r8), %xmm4 7026; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] 7027; AVX-NEXT: vbroadcastss 212(%r9), %ymm4 7028; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] 7029; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7030; AVX-NEXT: vunpckhps {{.*#+}} ymm3 = ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[6],ymm0[6],ymm9[7],ymm0[7] 7031; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,2],ymm1[1,2],ymm5[5,6],ymm1[5,6] 7032; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 7033; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] 7034; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] 7035; AVX-NEXT: vmovaps %ymm3, %ymm5 7036; AVX-NEXT: vbroadcastss 244(%r8), %xmm1 7037; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 7038; AVX-NEXT: vbroadcastss 244(%r9), %ymm1 7039; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] 7040; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7041; AVX-NEXT: vbroadcastss (%rcx), %xmm0 7042; AVX-NEXT: vbroadcastss (%rdx), %xmm1 7043; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7044; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7045; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 7046; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 7047; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 7048; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] 7049; AVX-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 7050; 
AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] 7051; AVX-NEXT: vbroadcastss (%r9), %ymm1 7052; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 7053; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7054; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7055; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 7056; AVX-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 7057; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] 7058; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7059; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 7060; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3] 7061; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 7062; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] 7063; AVX-NEXT: vmovaps (%r9), %xmm1 7064; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] 7065; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 7066; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] 7067; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7068; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7069; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[3,0],ymm0[7,4],ymm10[7,4] 7070; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] 7071; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] 7072; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 7073; AVX-NEXT: # ymm1 = mem[2,3,2,3] 7074; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] 7075; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] 7076; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] 7077; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 7078; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] 7079; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7080; AVX-NEXT: vbroadcastss 32(%rcx), %xmm0 7081; AVX-NEXT: vbroadcastss 32(%rdx), %xmm1 7082; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7083; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7084; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 7085; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 7086; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 7087; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] 7088; AVX-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 7089; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] 7090; AVX-NEXT: vbroadcastss 32(%r9), %ymm1 7091; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 7092; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7093; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7094; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 7095; AVX-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 7096; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] 7097; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7098; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 7099; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3] 7100; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 7101; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] 7102; AVX-NEXT: vmovaps 32(%r9), %xmm1 7103; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] 7104; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 7105; AVX-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] 7106; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7107; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7108; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 7109; AVX-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] 7110; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] 7111; AVX-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 7112; AVX-NEXT: # ymm0 = mem[2,3],ymm0[2,3] 7113; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 7114; AVX-NEXT: # ymm1 = mem[2,3,2,3] 7115; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] 7116; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] 7117; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] 7118; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 7119; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] 7120; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7121; AVX-NEXT: vbroadcastss 64(%rcx), %xmm0 7122; AVX-NEXT: vbroadcastss 64(%rdx), %xmm1 7123; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 7124; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7125; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 7126; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 7127; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 7128; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] 7129; AVX-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 7130; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] 7131; AVX-NEXT: vbroadcastss 64(%r9), %ymm1 7132; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 7133; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7134; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7135; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 7136; AVX-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] 7137; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] 7138; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7139; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 7140; AVX-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] 7141; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 7142; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] 7143; AVX-NEXT: vmovaps 64(%r9), %xmm3 7144; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm3[0,2,2,3] 7145; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 7146; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] 7147; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7148; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7149; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 7150; AVX-NEXT: # ymm3 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] 7151; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] 7152; AVX-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 7153; AVX-NEXT: # ymm3 = mem[2,3],ymm3[2,3] 7154; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 7155; AVX-NEXT: # ymm4 = mem[2,3,2,3] 7156; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] 7157; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] 7158; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] 
7159; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] 7160; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] 7161; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7162; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload 7163; AVX-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] 7164; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] 7165; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7166; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 7167; AVX-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] 7168; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 7169; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] 7170; AVX-NEXT: vmovaps 96(%r9), %xmm4 7171; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm4[0,2,2,3] 7172; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 7173; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] 7174; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7175; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 7176; AVX-NEXT: # ymm4 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] 7177; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] 7178; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] 7179; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload 7180; AVX-NEXT: # ymm8 = mem[2,3,2,3] 7181; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] 7182; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5],ymm8[6,7] 7183; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] 7184; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] 7185; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3,4,5,6],ymm8[7] 7186; AVX-NEXT: vbroadcastss 128(%rcx), %xmm8 7187; AVX-NEXT: vbroadcastss 128(%rdx), %xmm9 7188; AVX-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] 7189; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7190; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload 7191; AVX-NEXT: # xmm9 = xmm0[0],mem[0],xmm0[1],mem[1] 7192; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11 7193; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7] 7194; AVX-NEXT: vinsertf128 $1, 128(%r8), %ymm9, %ymm9 7195; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] 7196; AVX-NEXT: vbroadcastss 128(%r9), %ymm9 7197; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] 7198; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7199; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload 7200; AVX-NEXT: # xmm9 = xmm0[2],mem[2],xmm0[3],mem[3] 7201; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] 7202; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7203; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 7204; AVX-NEXT: vpermilps {{.*#+}} xmm11 = mem[2,1,3,3] 7205; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm11, %ymm11 7206; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5],ymm11[6,7] 7207; AVX-NEXT: vmovaps 128(%r9), %xmm11 7208; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm11[0,2,2,3] 7209; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 7210; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5,6],ymm11[7] 7211; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload 7212; AVX-NEXT: # ymm11 = ymm15[3,0],mem[3,0],ymm15[7,4],mem[7,4] 7213; AVX-NEXT: vshufps {{.*#+}} 
ymm11 = ymm11[2,0,2,3,6,4,6,7] 7214; AVX-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload 7215; AVX-NEXT: # ymm10 = mem[2,3],ymm11[2,3] 7216; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload 7217; AVX-NEXT: # ymm11 = mem[2,3,2,3] 7218; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,1,3,3,6,5,7,7] 7219; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7] 7220; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3] 7221; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,2,3,4,6,6,7] 7222; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5,6],ymm11[7] 7223; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload 7224; AVX-NEXT: # xmm11 = xmm13[2],mem[2],xmm13[3],mem[3] 7225; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] 7226; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7227; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 7228; AVX-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3] 7229; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 7230; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5],ymm12[6,7] 7231; AVX-NEXT: vmovaps 160(%r9), %xmm12 7232; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3] 7233; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 7234; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6],ymm12[7] 7235; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7236; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload 7237; AVX-NEXT: # ymm12 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] 7238; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] 7239; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm12[2,3] 7240; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload 7241; AVX-NEXT: # ymm12 = mem[2,3,2,3] 7242; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] 7243; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3,4,5],ymm12[6,7] 7244; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3] 7245; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] 7246; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4,5,6],ymm12[7] 7247; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7248; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload 7249; AVX-NEXT: # xmm12 = xmm0[2],mem[2],xmm0[3],mem[3] 7250; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] 7251; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7252; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 7253; AVX-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3] 7254; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 7255; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] 7256; AVX-NEXT: vmovaps 192(%r9), %xmm13 7257; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3] 7258; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 7259; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] 7260; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload 7261; AVX-NEXT: # ymm13 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] 7262; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] 7263; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[2,3],ymm13[2,3] 7264; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload 7265; AVX-NEXT: # ymm13 = mem[2,3,2,3] 7266; AVX-NEXT: vshufps {{.*#+}} ymm13 = 
ymm13[2,1,3,3,6,5,7,7] 7267; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3,4,5],ymm13[6,7] 7268; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3] 7269; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] 7270; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4,5,6],ymm13[7] 7271; AVX-NEXT: vbroadcastss 224(%rcx), %xmm13 7272; AVX-NEXT: vbroadcastss 224(%rdx), %xmm14 7273; AVX-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] 7274; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7275; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload 7276; AVX-NEXT: # xmm14 = xmm0[0],mem[0],xmm0[1],mem[1] 7277; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm15 7278; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] 7279; AVX-NEXT: vinsertf128 $1, 224(%r8), %ymm14, %ymm14 7280; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] 7281; AVX-NEXT: vbroadcastss 224(%r9), %ymm14 7282; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] 7283; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7284; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload 7285; AVX-NEXT: # xmm14 = xmm0[2],mem[2],xmm0[3],mem[3] 7286; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] 7287; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7288; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 7289; AVX-NEXT: vpermilps {{.*#+}} xmm15 = mem[2,1,3,3] 7290; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm15, %ymm15 7291; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] 7292; AVX-NEXT: vmovaps 224(%r9), %xmm15 7293; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,2,2,3] 7294; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 7295; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3,4,5,6],ymm0[7] 7296; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 7297; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload 7298; AVX-NEXT: # ymm14 = ymm6[3,0],mem[3,0],ymm6[7,4],mem[7,4] 7299; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] 7300; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] 7301; AVX-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload 7302; AVX-NEXT: # ymm14 = mem[2,3,2,3] 7303; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] 7304; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm5[2,3,4,5],ymm14[6,7] 7305; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] 7306; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] 7307; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4,5,6],ymm14[7] 7308; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 7309; AVX-NEXT: vmovaps %ymm5, 1504(%rax) 7310; AVX-NEXT: vmovaps %ymm0, 1408(%rax) 7311; AVX-NEXT: vmovaps %ymm13, 1344(%rax) 7312; AVX-NEXT: vmovaps %ymm2, 1312(%rax) 7313; AVX-NEXT: vmovaps %ymm12, 1216(%rax) 7314; AVX-NEXT: vmovaps %ymm7, 1120(%rax) 7315; AVX-NEXT: vmovaps %ymm11, 1024(%rax) 7316; AVX-NEXT: vmovaps %ymm10, 928(%rax) 7317; AVX-NEXT: vmovaps %ymm9, 832(%rax) 7318; AVX-NEXT: vmovaps %ymm8, 768(%rax) 7319; AVX-NEXT: vmovaps %ymm4, 736(%rax) 7320; AVX-NEXT: vmovaps %ymm3, 640(%rax) 7321; AVX-NEXT: vmovaps %ymm1, 544(%rax) 7322; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7323; AVX-NEXT: vmovaps %ymm0, 448(%rax) 7324; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7325; AVX-NEXT: 
vmovaps %ymm0, 384(%rax) 7326; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7327; AVX-NEXT: vmovaps %ymm0, 352(%rax) 7328; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7329; AVX-NEXT: vmovaps %ymm0, 256(%rax) 7330; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7331; AVX-NEXT: vmovaps %ymm0, 192(%rax) 7332; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7333; AVX-NEXT: vmovaps %ymm0, 160(%rax) 7334; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7335; AVX-NEXT: vmovaps %ymm0, 64(%rax) 7336; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7337; AVX-NEXT: vmovaps %ymm0, (%rax) 7338; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7339; AVX-NEXT: vmovaps %ymm0, 1472(%rax) 7340; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7341; AVX-NEXT: vmovaps %ymm0, 1280(%rax) 7342; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7343; AVX-NEXT: vmovaps %ymm0, 1152(%rax) 7344; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7345; AVX-NEXT: vmovaps %ymm0, 1088(%rax) 7346; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7347; AVX-NEXT: vmovaps %ymm0, 960(%rax) 7348; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7349; AVX-NEXT: vmovaps %ymm0, 896(%rax) 7350; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7351; AVX-NEXT: vmovaps %ymm0, 704(%rax) 7352; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7353; AVX-NEXT: vmovaps %ymm0, 576(%rax) 7354; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7355; AVX-NEXT: vmovaps %ymm0, 512(%rax) 7356; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7357; AVX-NEXT: vmovaps %ymm0, 320(%rax) 7358; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7359; AVX-NEXT: vmovaps %ymm0, 128(%rax) 7360; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7361; AVX-NEXT: vmovaps %ymm0, 1440(%rax) 7362; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7363; AVX-NEXT: vmovaps %ymm0, 1376(%rax) 7364; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7365; AVX-NEXT: vmovaps %ymm0, 1248(%rax) 7366; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7367; AVX-NEXT: vmovaps %ymm0, 1184(%rax) 7368; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7369; AVX-NEXT: vmovaps %ymm0, 1056(%rax) 7370; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7371; AVX-NEXT: vmovaps %ymm0, 992(%rax) 7372; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7373; AVX-NEXT: vmovaps %ymm0, 864(%rax) 7374; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7375; AVX-NEXT: vmovaps %ymm0, 800(%rax) 7376; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7377; AVX-NEXT: vmovaps %ymm0, 672(%rax) 7378; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7379; AVX-NEXT: vmovaps %ymm0, 608(%rax) 7380; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7381; AVX-NEXT: vmovaps %ymm0, 480(%rax) 7382; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7383; AVX-NEXT: vmovaps %ymm0, 416(%rax) 7384; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7385; AVX-NEXT: vmovaps %ymm0, 288(%rax) 7386; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7387; AVX-NEXT: vmovaps %ymm0, 
224(%rax) 7388; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7389; AVX-NEXT: vmovaps %ymm0, 96(%rax) 7390; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7391; AVX-NEXT: vmovaps %ymm0, 32(%rax) 7392; AVX-NEXT: addq $2504, %rsp # imm = 0x9C8 7393; AVX-NEXT: vzeroupper 7394; AVX-NEXT: retq 7395; 7396; AVX2-LABEL: store_i32_stride6_vf64: 7397; AVX2: # %bb.0: 7398; AVX2-NEXT: subq $2504, %rsp # imm = 0x9C8 7399; AVX2-NEXT: vmovdqa (%rsi), %xmm0 7400; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7401; AVX2-NEXT: vmovdqa 32(%rsi), %xmm1 7402; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7403; AVX2-NEXT: vmovdqa (%rdi), %xmm3 7404; AVX2-NEXT: vmovdqa 32(%rdi), %xmm10 7405; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7406; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] 7407; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7408; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 7409; AVX2-NEXT: vmovdqa (%rcx), %xmm0 7410; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7411; AVX2-NEXT: vmovdqa 32(%rcx), %xmm8 7412; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7413; AVX2-NEXT: vmovdqa 64(%rcx), %xmm7 7414; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7415; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] 7416; AVX2-NEXT: vmovdqa (%rdx), %xmm0 7417; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7418; AVX2-NEXT: vmovdqa 32(%rdx), %xmm9 7419; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7420; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3] 7421; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 7422; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] 7423; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] 7424; AVX2-NEXT: vmovdqa (%r8), %xmm2 7425; AVX2-NEXT: vmovdqa 32(%r8), %xmm6 7426; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7427; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero 7428; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7429; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] 7430; AVX2-NEXT: vpbroadcastd 4(%r9), %ymm5 7431; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] 7432; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7433; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] 7434; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7435; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] 7436; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3] 7437; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 7438; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 7439; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 7440; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] 7441; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero 7442; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] 7443; AVX2-NEXT: vpbroadcastd 36(%r9), %ymm5 7444; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] 7445; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7446; AVX2-NEXT: vmovdqa 64(%rdx), %xmm0 7447; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7448; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] 7449; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = 
xmm0[1,2,2,3] 7450; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 7451; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,2,1] 7452; AVX2-NEXT: vmovdqa 64(%rsi), %xmm4 7453; AVX2-NEXT: vmovdqa 64(%rdi), %xmm5 7454; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] 7455; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7456; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 7457; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] 7458; AVX2-NEXT: vmovdqa 64(%r8), %xmm0 7459; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7460; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero 7461; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] 7462; AVX2-NEXT: vpbroadcastd 68(%r9), %ymm7 7463; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] 7464; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7465; AVX2-NEXT: vmovdqa 96(%rcx), %xmm0 7466; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7467; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3] 7468; AVX2-NEXT: vmovdqa 96(%rdx), %xmm0 7469; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7470; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,2,2,3] 7471; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] 7472; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1] 7473; AVX2-NEXT: vmovdqa 96(%rsi), %xmm6 7474; AVX2-NEXT: vmovdqa 96(%rdi), %xmm7 7475; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] 7476; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7477; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 7478; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] 7479; AVX2-NEXT: vmovdqa 96(%r8), %xmm0 7480; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7481; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero 7482; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] 7483; AVX2-NEXT: vpbroadcastd 100(%r9), %ymm9 7484; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] 7485; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7486; AVX2-NEXT: vmovdqa 128(%rcx), %xmm0 7487; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7488; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,2,2,3] 7489; AVX2-NEXT: vmovdqa 128(%rdx), %xmm0 7490; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7491; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3] 7492; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] 7493; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,1,2,1] 7494; AVX2-NEXT: vmovdqa 128(%rsi), %xmm8 7495; AVX2-NEXT: vmovdqa 128(%rdi), %xmm9 7496; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] 7497; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7498; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 7499; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] 7500; AVX2-NEXT: vmovdqa 128(%r8), %xmm0 7501; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7502; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm0[0],zero,xmm0[1],zero 7503; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] 7504; AVX2-NEXT: vpbroadcastd 132(%r9), %ymm11 7505; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] 7506; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7507; AVX2-NEXT: 
vmovdqa 160(%rcx), %xmm0 7508; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7509; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[1,2,2,3] 7510; AVX2-NEXT: vmovdqa 160(%rdx), %xmm0 7511; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7512; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,2,2,3] 7513; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] 7514; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,1,2,1] 7515; AVX2-NEXT: vmovdqa 160(%rsi), %xmm10 7516; AVX2-NEXT: vmovdqa 160(%rdi), %xmm11 7517; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] 7518; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7519; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 7520; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] 7521; AVX2-NEXT: vmovdqa 160(%r8), %xmm0 7522; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7523; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero 7524; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] 7525; AVX2-NEXT: vpbroadcastd 164(%r9), %ymm13 7526; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] 7527; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7528; AVX2-NEXT: vmovdqa 192(%rcx), %xmm0 7529; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7530; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[1,2,2,3] 7531; AVX2-NEXT: vmovdqa 192(%rdx), %xmm0 7532; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7533; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3] 7534; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] 7535; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,1,2,1] 7536; AVX2-NEXT: vmovdqa 192(%rsi), %xmm12 7537; AVX2-NEXT: vmovdqa 192(%rdi), %xmm13 7538; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] 7539; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7540; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 7541; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] 7542; AVX2-NEXT: vmovdqa 192(%r8), %xmm0 7543; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7544; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero 7545; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] 7546; AVX2-NEXT: vpbroadcastd 196(%r9), %ymm15 7547; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] 7548; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7549; AVX2-NEXT: vmovdqa 224(%rcx), %xmm0 7550; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7551; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] 7552; AVX2-NEXT: vmovdqa 224(%rdx), %xmm0 7553; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7554; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,3] 7555; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 7556; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,2,1] 7557; AVX2-NEXT: vmovdqa 224(%rsi), %xmm14 7558; AVX2-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7559; AVX2-NEXT: vmovdqa 224(%rdi), %xmm0 7560; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7561; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] 7562; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7563; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 7564; AVX2-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] 7565; AVX2-NEXT: vmovdqa 224(%r8), %xmm14 7566; AVX2-NEXT: vmovdqa %xmm14, (%rsp) # 16-byte Spill 7567; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero 7568; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] 7569; AVX2-NEXT: vpbroadcastd 228(%r9), %ymm14 7570; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7] 7571; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7572; AVX2-NEXT: vpbroadcastd (%rcx), %xmm0 7573; AVX2-NEXT: vpbroadcastd (%rdx), %xmm14 7574; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] 7575; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload 7576; AVX2-NEXT: # xmm1 = xmm3[0],mem[0],xmm3[1],mem[1] 7577; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] 7578; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 7579; AVX2-NEXT: vpbroadcastq %xmm2, %ymm1 7580; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 7581; AVX2-NEXT: vmovdqa (%r9), %xmm1 7582; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7583; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 7584; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 7585; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7586; AVX2-NEXT: vmovdqa (%rdx), %ymm0 7587; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7588; AVX2-NEXT: vmovdqa (%rcx), %ymm1 7589; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7590; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,2,4,5,6,6] 7591; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,1,2,3,5,5,6,7] 7592; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7] 7593; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 7594; AVX2-NEXT: vmovdqa (%rdi), %ymm0 7595; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7596; AVX2-NEXT: vmovdqa (%rsi), %ymm1 7597; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7598; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] 7599; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7600; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] 7601; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero 7602; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7] 7603; AVX2-NEXT: vpbroadcastd 20(%r9), %ymm14 7604; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] 7605; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7606; AVX2-NEXT: vbroadcastss 32(%rcx), %xmm2 7607; AVX2-NEXT: vbroadcastss 32(%rdx), %xmm14 7608; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] 7609; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7610; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload 7611; AVX2-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1] 7612; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] 7613; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] 7614; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload 7615; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] 7616; AVX2-NEXT: vmovaps 32(%r9), %xmm0 7617; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7618; AVX2-NEXT: vbroadcastss %xmm0, %ymm3 7619; 
AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] 7620; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7621; AVX2-NEXT: vmovdqa 32(%rdx), %ymm0 7622; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7623; AVX2-NEXT: vmovdqa 32(%rcx), %ymm3 7624; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[0,1,2,2,4,5,6,6] 7625; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] 7626; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] 7627; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 7628; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 7629; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7630; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 7631; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7632; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] 7633; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7634; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] 7635; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero 7636; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] 7637; AVX2-NEXT: vpbroadcastd 52(%r9), %ymm15 7638; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] 7639; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7640; AVX2-NEXT: vpbroadcastd 64(%rcx), %xmm14 7641; AVX2-NEXT: vpbroadcastd 64(%rdx), %xmm15 7642; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 7643; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 7644; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 7645; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7] 7646; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload 7647; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] 7648; AVX2-NEXT: vmovdqa 64(%r9), %xmm0 7649; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7650; AVX2-NEXT: vpbroadcastd %xmm0, %ymm5 7651; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] 7652; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7653; AVX2-NEXT: vmovdqa 64(%rdx), %ymm5 7654; AVX2-NEXT: vmovdqa 64(%rcx), %ymm4 7655; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[0,1,2,2,4,5,6,6] 7656; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7] 7657; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] 7658; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 7659; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 7660; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7661; AVX2-NEXT: vmovdqa 64(%rsi), %ymm1 7662; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7663; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] 7664; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7665; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] 7666; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero 7667; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] 7668; AVX2-NEXT: vpbroadcastd 84(%r9), %ymm15 7669; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] 7670; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7671; AVX2-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm14 # 16-byte Folded Reload
; AVX2-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
; AVX2-NEXT: vmovdqa 96(%r9), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm7
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 96(%rdx), %ymm7
; AVX2-NEXT: vmovdqa 96(%rcx), %ymm6
; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[0,1,2,2,4,5,6,6]
; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 96(%rsi), %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-NEXT: vpbroadcastd 116(%r9), %ymm15
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpbroadcastd 128(%rcx), %xmm14
; AVX2-NEXT: vpbroadcastd 128(%rdx), %xmm15
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm14[2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
; AVX2-NEXT: vmovdqa 128(%r9), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm9
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 128(%rdx), %ymm9
; AVX2-NEXT: vmovdqa 128(%rcx), %ymm8
; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[0,1,2,2,4,5,6,6]
; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[1,1,2,3,5,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 128(%rsi), %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-NEXT: vpbroadcastd 148(%r9), %ymm15
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3],ymm10[4,5,6,7]
; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
; AVX2-NEXT: vmovdqa 160(%r9), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm11
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 160(%rdx), %ymm11
; AVX2-NEXT: vmovdqa 160(%rcx), %ymm10
; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm10[0,1,2,2,4,5,6,6]
; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,1,2,3,5,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 160(%rsi), %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-NEXT: vpbroadcastd 180(%r9), %ymm15
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7]
; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
; AVX2-NEXT: vmovdqa 192(%r9), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm13
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 192(%rdx), %ymm13
; AVX2-NEXT: vmovdqa 192(%rcx), %ymm12
; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[0,1,2,2,4,5,6,6]
; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[1,1,2,3,5,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-NEXT: vmovdqa 192(%rdi), %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 192(%rsi), %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-NEXT: vpbroadcastd 212(%r9), %ymm15
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vbroadcastss 224(%rcx), %xmm14
; AVX2-NEXT: vbroadcastss 224(%rdx), %xmm15
; AVX2-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
; AVX2-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
; AVX2-NEXT: vmovaps 224(%r9), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vbroadcastss %xmm0, %ymm15
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 224(%rdx), %ymm14
; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 224(%rcx), %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[1,1,2,3,5,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4],ymm0[5],ymm15[6],ymm0[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,1,2,3]
; AVX2-NEXT: vmovdqa 224(%rdi), %ymm14
; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 224(%rsi), %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7]
; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7]
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpbroadcastd 244(%r9), %ymm15
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: # xmm15 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-NEXT: # xmm15 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovaps (%r8), %ymm15
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7]
; AVX2-NEXT: vbroadcastss 16(%r9), %ymm14
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqa 32(%r8), %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vpbroadcastd 48(%r9), %ymm14
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqa 64(%r8), %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vpbroadcastd 80(%r9), %ymm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqa 96(%r8), %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vpbroadcastd 112(%r9), %ymm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqa 128(%r8), %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vpbroadcastd 144(%r9), %ymm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqa 160(%r8), %ymm14
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7]
; AVX2-NEXT: vpbroadcastd 176(%r9), %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[2,1,3,3,6,5,7,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5],ymm10[6,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
; AVX2-NEXT: # xmm11 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5],ymm11[6,7]
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
; AVX2-NEXT: # xmm11 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm11[1],ymm0[2,3,4,5,6],ymm11[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5]
; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqa 192(%r8), %ymm14
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7]
; AVX2-NEXT: vpbroadcastd 208(%r9), %ymm15
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
; AVX2-NEXT: # ymm12 = mem[2,3],ymm12[2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[2,1,3,3,6,5,7,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload
; AVX2-NEXT: # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13
; AVX2-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7]
; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; AVX2-NEXT: # xmm14 = mem[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX2-NEXT: # ymm14 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vunpcklps {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-NEXT: vmovaps 224(%r8), %ymm15
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
; AVX2-NEXT: vbroadcastss 240(%r9), %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
; AVX2-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
; AVX2-NEXT: # ymm14 = mem[2,3],ymm14[2,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm15 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7]
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vmovaps %ymm14, 1504(%rax)
; AVX2-NEXT: vmovaps %ymm4, 1440(%rax)
; AVX2-NEXT: vmovaps %ymm13, 1408(%rax)
; AVX2-NEXT: vmovdqa %ymm12, 1312(%rax)
; AVX2-NEXT: vmovdqa %ymm2, 1248(%rax)
; AVX2-NEXT: vmovaps %ymm11, 1216(%rax)
; AVX2-NEXT: vmovdqa %ymm10, 1120(%rax)
; AVX2-NEXT: vmovdqa %ymm3, 1056(%rax)
; AVX2-NEXT: vmovaps %ymm9, 1024(%rax)
; AVX2-NEXT: vmovdqa %ymm8, 928(%rax)
; AVX2-NEXT: vmovdqa %ymm5, 864(%rax)
; AVX2-NEXT: vmovaps %ymm7, 832(%rax)
; AVX2-NEXT: vmovdqa %ymm6, 736(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 672(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 640(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 544(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 480(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 448(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 352(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 288(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 256(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 160(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 96(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 64(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1472(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1344(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1280(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1152(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1088(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 960(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 896(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 768(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 704(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 576(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 512(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 384(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 320(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 192(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 128(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, (%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1376(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1184(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 992(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 800(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 608(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 416(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 224(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
; AVX2-NEXT: addq $2504, %rsp # imm = 0x9C8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i32_stride6_vf64:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: subq $2504, %rsp # imm = 0x9C8
; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm1
; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm3
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm10
; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4
; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm8
; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovdqa 64(%rcx), %xmm7
; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm9
; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7]
; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm6
; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero
; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 4(%r9), %ymm5
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3]
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 36(%r9), %ymm5
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 64(%rdx), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3]
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,2,1]
; AVX2-FP-NEXT: vmovdqa 64(%rsi), %xmm4
; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vmovdqa 64(%r8), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 68(%r9), %ymm7
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 96(%rcx), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3]
; AVX2-FP-NEXT: vmovdqa 96(%rdx), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,2,2,3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1]
; AVX2-FP-NEXT: vmovdqa 96(%rsi), %xmm6
; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm7
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
; AVX2-FP-NEXT: vmovdqa 96(%r8), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 100(%r9), %ymm9
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 128(%rcx), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,2,2,3]
; AVX2-FP-NEXT: vmovdqa 128(%rdx), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,1,2,1]
; AVX2-FP-NEXT: vmovdqa 128(%rsi), %xmm8
; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm9
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
; AVX2-FP-NEXT: vmovdqa 128(%r8), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm0[0],zero,xmm0[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 132(%r9), %ymm11
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 160(%rcx), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[1,2,2,3]
; AVX2-FP-NEXT: vmovdqa 160(%rdx), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,2,2,3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,1,2,1]
; AVX2-FP-NEXT: vmovdqa 160(%rsi), %xmm10
; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm11
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
; AVX2-FP-NEXT: vmovdqa 160(%r8), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 164(%r9), %ymm13
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 192(%rcx), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[1,2,2,3]
; AVX2-FP-NEXT: vmovdqa 192(%rdx), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,1,2,1]
; AVX2-FP-NEXT: vmovdqa 192(%rsi), %xmm12
; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm13
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vmovdqa 192(%r8), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 196(%r9), %ymm15
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 224(%rcx), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3]
; AVX2-FP-NEXT: vmovdqa 224(%rdx), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,2,1]
; AVX2-FP-NEXT: vmovdqa 224(%rsi), %xmm14
; AVX2-FP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vmovdqa 224(%r8), %xmm14
; AVX2-FP-NEXT: vmovdqa %xmm14, (%rsp) # 16-byte Spill
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 228(%r9), %ymm14
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpbroadcastd (%rcx), %xmm0
; AVX2-FP-NEXT: vpbroadcastd (%rdx), %xmm14
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm1 = xmm3[0],mem[0],xmm3[1],mem[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastq %xmm2, %ymm1
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1
; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 20(%r9), %ymm14
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vbroadcastss 32(%rcx), %xmm2
; AVX2-FP-NEXT: vbroadcastss 32(%rdx), %xmm14
; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vbroadcastss %xmm0, %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm3
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 52(%r9), %ymm15
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpbroadcastd 64(%rcx), %xmm14
; AVX2-FP-NEXT: vpbroadcastd 64(%rdx), %xmm15
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
; AVX2-FP-NEXT: vmovdqa 64(%r9), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpbroadcastd %xmm0, %ymm5
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 64(%rdx), %ymm5
; AVX2-FP-NEXT: vmovdqa 64(%rcx), %ymm4
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 64(%rsi), %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 84(%r9), %ymm15
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vmovdqa 96(%r9), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpbroadcastd %xmm0, %ymm7
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 96(%rdx), %ymm7
; AVX2-FP-NEXT: vmovdqa 96(%rcx), %ymm6
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 96(%rsi), %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 116(%r9), %ymm15
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpbroadcastd 128(%rcx), %xmm14
; AVX2-FP-NEXT: vpbroadcastd 128(%rdx), %xmm15
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm14[2,3],ymm8[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
; AVX2-FP-NEXT: vmovdqa 128(%r9), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpbroadcastd %xmm0, %ymm9
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 128(%rdx), %ymm9
; AVX2-FP-NEXT: vmovdqa 128(%rcx), %ymm8
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 128(%rsi), %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 148(%r9), %ymm15
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3],ymm10[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
; AVX2-FP-NEXT: vmovdqa 160(%r9), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpbroadcastd %xmm0, %ymm11
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 160(%rdx), %ymm11
; AVX2-FP-NEXT: vmovdqa 160(%rcx), %ymm10
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm10[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 160(%rsi), %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 180(%r9), %ymm15
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
; AVX2-FP-NEXT: vmovdqa 192(%r9), %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpbroadcastd %xmm0, %ymm13
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 192(%rdx), %ymm13
; AVX2-FP-NEXT: vmovdqa 192(%rcx), %ymm12
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 192(%rsi), %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 212(%r9), %ymm15
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vbroadcastss 224(%rcx), %xmm14
; AVX2-FP-NEXT: vbroadcastss 224(%rdx), %xmm15
; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
; AVX2-FP-NEXT: vmovaps 224(%r9), %xmm0
; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vbroadcastss %xmm0, %ymm15
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 224(%rdx), %ymm14
; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 224(%rcx), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4],ymm0[5],ymm15[6],ymm0[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,1,2,3]
; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm14
; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 224(%rsi), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7]
; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 244(%r9), %ymm15
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0
; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm15 = mem[2,2,3,3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm15 = mem[2,2,3,3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovaps (%r8), %ymm15
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vbroadcastss 16(%r9), %ymm14
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm1
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vpbroadcastd 48(%r9), %ymm14
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm1 = mem[2,2,3,3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqa 64(%r8), %ymm1
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vpbroadcastd 80(%r9), %ymm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm0 =
xmm0[2],mem[2],xmm0[3],mem[3] 8768; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] 8769; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8770; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 8771; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8772; AVX2-FP-NEXT: # xmm1 = mem[2,2,3,3] 8773; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] 8774; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] 8775; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8776; AVX2-FP-NEXT: # xmm1 = mem[2,2,3,3] 8777; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] 8778; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] 8779; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8780; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8781; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8782; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 8783; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] 8784; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 8785; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 8786; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 8787; AVX2-FP-NEXT: vmovdqa 96(%r8), %ymm1 8788; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 8789; AVX2-FP-NEXT: vpbroadcastd 112(%r9), %ymm2 8790; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] 8791; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8792; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] 8793; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 8794; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8795; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] 8796; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] 8797; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 8798; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] 8799; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] 8800; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 8801; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] 8802; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8803; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 8804; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 8805; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] 8806; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8807; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 8808; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8809; AVX2-FP-NEXT: # xmm1 = mem[2,2,3,3] 8810; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] 8811; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] 8812; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8813; AVX2-FP-NEXT: # xmm1 = mem[2,2,3,3] 8814; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] 8815; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] 8816; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8817; AVX2-FP-NEXT: vpunpckldq 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8818; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 8819; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] 8820; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 8821; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 8822; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 8823; AVX2-FP-NEXT: vmovdqa 128(%r8), %ymm1 8824; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 8825; AVX2-FP-NEXT: vpbroadcastd 144(%r9), %ymm2 8826; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] 8827; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] 8828; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 8829; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8830; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] 8831; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] 8832; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 8833; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] 8834; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] 8835; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 8836; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] 8837; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8838; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 8839; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 8840; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] 8841; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8842; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 8843; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8844; AVX2-FP-NEXT: # xmm1 = mem[2,2,3,3] 8845; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] 8846; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] 8847; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8848; AVX2-FP-NEXT: # xmm1 = mem[2,2,3,3] 8849; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] 8850; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] 8851; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8852; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8853; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 8854; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] 8855; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 8856; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 8857; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 8858; AVX2-FP-NEXT: vmovdqa 160(%r8), %ymm14 8859; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] 8860; AVX2-FP-NEXT: vpbroadcastd 176(%r9), %ymm1 8861; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 8862; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] 8863; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 8864; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8865; AVX2-FP-NEXT: # ymm0 = 
mem[2,3],ymm0[2,3] 8866; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[2,1,3,3,6,5,7,7] 8867; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] 8868; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5],ymm10[6,7] 8869; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7] 8870; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] 8871; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7] 8872; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8873; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 8874; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 8875; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] 8876; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 8877; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 8878; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 8879; AVX2-FP-NEXT: # xmm11 = mem[2,2,3,3] 8880; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] 8881; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5],ymm11[6,7] 8882; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 8883; AVX2-FP-NEXT: # xmm11 = mem[2,2,3,3] 8884; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] 8885; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm11[1],ymm0[2,3,4,5,6],ymm11[7] 8886; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8887; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8888; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 8889; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] 8890; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] 8891; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 8892; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] 8893; AVX2-FP-NEXT: vmovdqa 192(%r8), %ymm14 8894; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] 8895; AVX2-FP-NEXT: vpbroadcastd 208(%r9), %ymm15 8896; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] 8897; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] 8898; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] 8899; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload 8900; AVX2-FP-NEXT: # ymm12 = mem[2,3],ymm12[2,3] 8901; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[2,1,3,3,6,5,7,7] 8902; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] 8903; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] 8904; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] 8905; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] 8906; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] 8907; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8908; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload 8909; AVX2-FP-NEXT: # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3] 8910; AVX2-FP-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] 8911; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 8912; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 8913; AVX2-FP-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload 8914; AVX2-FP-NEXT: # xmm14 = 
mem[2,2,3,3] 8915; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] 8916; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] 8917; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 8918; AVX2-FP-NEXT: # xmm14 = mem[2,2,3,3] 8919; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] 8920; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] 8921; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8922; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload 8923; AVX2-FP-NEXT: # ymm14 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 8924; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8925; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8926; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 8927; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] 8928; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] 8929; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] 8930; AVX2-FP-NEXT: vmovaps 224(%r8), %ymm15 8931; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] 8932; AVX2-FP-NEXT: vbroadcastss 240(%r9), %ymm4 8933; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] 8934; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] 8935; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] 8936; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload 8937; AVX2-FP-NEXT: # ymm14 = mem[2,3],ymm14[2,3] 8938; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] 8939; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] 8940; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] 8941; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm15 = mem[0,2,2,3,4,6,6,7] 8942; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] 8943; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] 8944; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8945; AVX2-FP-NEXT: vmovaps %ymm14, 1504(%rax) 8946; AVX2-FP-NEXT: vmovaps %ymm4, 1440(%rax) 8947; AVX2-FP-NEXT: vmovaps %ymm13, 1408(%rax) 8948; AVX2-FP-NEXT: vmovdqa %ymm12, 1312(%rax) 8949; AVX2-FP-NEXT: vmovdqa %ymm2, 1248(%rax) 8950; AVX2-FP-NEXT: vmovaps %ymm11, 1216(%rax) 8951; AVX2-FP-NEXT: vmovdqa %ymm10, 1120(%rax) 8952; AVX2-FP-NEXT: vmovdqa %ymm3, 1056(%rax) 8953; AVX2-FP-NEXT: vmovaps %ymm9, 1024(%rax) 8954; AVX2-FP-NEXT: vmovdqa %ymm8, 928(%rax) 8955; AVX2-FP-NEXT: vmovdqa %ymm5, 864(%rax) 8956; AVX2-FP-NEXT: vmovaps %ymm7, 832(%rax) 8957; AVX2-FP-NEXT: vmovdqa %ymm6, 736(%rax) 8958; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8959; AVX2-FP-NEXT: vmovaps %ymm0, 672(%rax) 8960; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8961; AVX2-FP-NEXT: vmovaps %ymm0, 640(%rax) 8962; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8963; AVX2-FP-NEXT: vmovaps %ymm0, 544(%rax) 8964; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8965; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rax) 8966; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8967; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rax) 8968; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8969; AVX2-FP-NEXT: vmovaps 
%ymm0, 352(%rax) 8970; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8971; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rax) 8972; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8973; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rax) 8974; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8975; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rax) 8976; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8977; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax) 8978; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8979; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax) 8980; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8981; AVX2-FP-NEXT: vmovaps %ymm0, 1472(%rax) 8982; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8983; AVX2-FP-NEXT: vmovaps %ymm0, 1344(%rax) 8984; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8985; AVX2-FP-NEXT: vmovaps %ymm0, 1280(%rax) 8986; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8987; AVX2-FP-NEXT: vmovaps %ymm0, 1152(%rax) 8988; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8989; AVX2-FP-NEXT: vmovaps %ymm0, 1088(%rax) 8990; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8991; AVX2-FP-NEXT: vmovaps %ymm0, 960(%rax) 8992; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8993; AVX2-FP-NEXT: vmovaps %ymm0, 896(%rax) 8994; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8995; AVX2-FP-NEXT: vmovaps %ymm0, 768(%rax) 8996; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8997; AVX2-FP-NEXT: vmovaps %ymm0, 704(%rax) 8998; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8999; AVX2-FP-NEXT: vmovaps %ymm0, 576(%rax) 9000; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9001; AVX2-FP-NEXT: vmovaps %ymm0, 512(%rax) 9002; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9003; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rax) 9004; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9005; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rax) 9006; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9007; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rax) 9008; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9009; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax) 9010; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9011; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) 9012; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9013; AVX2-FP-NEXT: vmovaps %ymm0, 1376(%rax) 9014; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9015; AVX2-FP-NEXT: vmovaps %ymm0, 1184(%rax) 9016; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9017; AVX2-FP-NEXT: vmovaps %ymm0, 992(%rax) 9018; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9019; AVX2-FP-NEXT: vmovaps %ymm0, 800(%rax) 9020; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9021; AVX2-FP-NEXT: vmovaps %ymm0, 608(%rax) 9022; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9023; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rax) 9024; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9025; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) 9026; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9027; AVX2-FP-NEXT: vmovaps %ymm0, 
32(%rax) 9028; AVX2-FP-NEXT: addq $2504, %rsp # imm = 0x9C8 9029; AVX2-FP-NEXT: vzeroupper 9030; AVX2-FP-NEXT: retq 9031; 9032; AVX2-FCP-LABEL: store_i32_stride6_vf64: 9033; AVX2-FCP: # %bb.0: 9034; AVX2-FCP-NEXT: subq $2376, %rsp # imm = 0x948 9035; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm9 9036; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 9037; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9038; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7 9039; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 9040; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9041; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] 9042; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9043; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9044; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm2 9045; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9046; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 9047; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9048; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %xmm5 9049; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9050; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,2,2,3] 9051; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 9052; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9053; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm8 9054; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9055; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,3] 9056; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 9057; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] 9058; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] 9059; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 9060; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm11 9061; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero 9062; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] 9063; AVX2-FCP-NEXT: vpbroadcastd 4(%r9), %ymm4 9064; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] 9065; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9066; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] 9067; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9068; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,2,2,3] 9069; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] 9070; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 9071; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 9072; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] 9073; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] 9074; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero 9075; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] 9076; AVX2-FCP-NEXT: vpbroadcastd 36(%r9), %ymm4 9077; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] 9078; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9079; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %xmm0 9080; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9081; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,2,2,3] 9082; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,2,2,3] 9083; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 9084; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] 9085; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm15 9086; AVX2-FCP-NEXT: vmovdqa 64(%rdi), 
%xmm13 9087; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] 9088; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9089; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 9090; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] 9091; AVX2-FCP-NEXT: vmovdqa 64(%r8), %xmm4 9092; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero 9093; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] 9094; AVX2-FCP-NEXT: vpbroadcastd 68(%r9), %ymm5 9095; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] 9096; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9097; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm0 9098; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9099; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] 9100; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %xmm0 9101; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9102; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] 9103; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] 9104; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] 9105; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm10 9106; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm12 9107; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] 9108; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9109; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 9110; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] 9111; AVX2-FCP-NEXT: vmovdqa 96(%r8), %xmm8 9112; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero 9113; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] 9114; AVX2-FCP-NEXT: vpbroadcastd 100(%r9), %ymm5 9115; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] 9116; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9117; AVX2-FCP-NEXT: vmovdqa 128(%rcx), %xmm0 9118; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9119; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] 9120; AVX2-FCP-NEXT: vmovdqa 128(%rdx), %xmm0 9121; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9122; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] 9123; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] 9124; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] 9125; AVX2-FCP-NEXT: vmovdqa 128(%rsi), %xmm0 9126; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9127; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm1 9128; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9129; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9130; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9131; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 9132; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] 9133; AVX2-FCP-NEXT: vmovdqa 128(%r8), %xmm5 9134; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero 9135; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] 9136; AVX2-FCP-NEXT: vpbroadcastd 132(%r9), %ymm6 9137; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] 9138; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9139; AVX2-FCP-NEXT: vmovdqa 160(%rcx), %xmm0 9140; AVX2-FCP-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9141; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] 9142; AVX2-FCP-NEXT: vmovdqa 160(%rdx), %xmm0 9143; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9144; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3] 9145; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] 9146; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] 9147; AVX2-FCP-NEXT: vmovdqa 160(%rsi), %xmm0 9148; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9149; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm1 9150; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9151; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9152; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9153; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 9154; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] 9155; AVX2-FCP-NEXT: vmovdqa 160(%r8), %xmm6 9156; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero 9157; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] 9158; AVX2-FCP-NEXT: vpbroadcastd 164(%r9), %ymm14 9159; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] 9160; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9161; AVX2-FCP-NEXT: vmovdqa 192(%rcx), %xmm0 9162; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9163; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] 9164; AVX2-FCP-NEXT: vmovdqa 192(%rdx), %xmm0 9165; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9166; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] 9167; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] 9168; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] 9169; AVX2-FCP-NEXT: vmovdqa 192(%rsi), %xmm0 9170; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9171; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm1 9172; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9173; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9174; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9175; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 9176; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] 9177; AVX2-FCP-NEXT: vmovdqa 192(%r8), %xmm0 9178; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9179; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero 9180; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] 9181; AVX2-FCP-NEXT: vpbroadcastd 196(%r9), %ymm14 9182; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] 9183; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9184; AVX2-FCP-NEXT: vmovdqa 224(%rcx), %xmm3 9185; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9186; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] 9187; AVX2-FCP-NEXT: vmovdqa 224(%rdx), %xmm0 9188; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9189; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] 9190; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] 9191; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] 9192; AVX2-FCP-NEXT: vmovdqa 224(%rsi), %xmm0 9193; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9194; AVX2-FCP-NEXT: 
vmovdqa 224(%rdi), %xmm1 9195; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9196; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9197; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9198; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 9199; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] 9200; AVX2-FCP-NEXT: vmovdqa 224(%r8), %xmm0 9201; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 9202; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero 9203; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] 9204; AVX2-FCP-NEXT: vpbroadcastd 228(%r9), %ymm14 9205; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] 9206; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9207; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm3 9208; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm14 9209; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] 9210; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 9211; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] 9212; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] 9213; AVX2-FCP-NEXT: vpbroadcastq %xmm2, %ymm0 9214; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] 9215; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm1 9216; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 9217; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9218; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm14 9219; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm0 9220; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9221; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] 9222; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,2,3,5,5,6,7] 9223; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 9224; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 9225; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 9226; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9227; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2 9228; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9229; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] 9230; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9231; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 9232; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero 9233; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 9234; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm1 9235; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] 9236; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9237; AVX2-FCP-NEXT: vpbroadcastd 32(%rcx), %xmm0 9238; AVX2-FCP-NEXT: vpbroadcastd 32(%rdx), %xmm1 9239; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9240; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9241; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 9242; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 9243; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] 9244; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 9245; AVX2-FCP-NEXT: vpbroadcastq %xmm11, %ymm1 9246; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 9247; AVX2-FCP-NEXT: vpbroadcastd 32(%r9), %ymm1 9248; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 9249; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9250; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm1 9251; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9252; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm0 9253; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9254; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] 9255; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7] 9256; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 9257; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 9258; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 9259; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9260; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 9261; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9262; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] 9263; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9264; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 9265; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero 9266; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 9267; AVX2-FCP-NEXT: vpbroadcastd 52(%r9), %ymm1 9268; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] 9269; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9270; AVX2-FCP-NEXT: vpbroadcastd 64(%rcx), %xmm0 9271; AVX2-FCP-NEXT: vpbroadcastd 64(%rdx), %xmm1 9272; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9273; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] 9274; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] 9275; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 9276; AVX2-FCP-NEXT: vpbroadcastq %xmm4, %ymm1 9277; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 9278; AVX2-FCP-NEXT: vpbroadcastd 64(%r9), %ymm1 9279; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 9280; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9281; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %ymm1 9282; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9283; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 9284; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9285; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] 9286; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7] 9287; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 9288; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 9289; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 9290; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9291; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm2 9292; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9293; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] 9294; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9295; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 9296; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero 9297; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 9298; AVX2-FCP-NEXT: vpbroadcastd 84(%r9), %ymm1 9299; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] 9300; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9301; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9302; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9303; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9304; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] 9305; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] 9306; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] 9307; AVX2-FCP-NEXT: vpbroadcastq %xmm8, %ymm1 9308; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 9309; AVX2-FCP-NEXT: vpbroadcastd 96(%r9), %ymm1 9310; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 9311; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9312; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm1 9313; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9314; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm2 9315; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] 9316; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] 9317; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] 9318; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 9319; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 9320; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9321; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm3 9322; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9323; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] 9324; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9325; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 9326; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero 9327; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] 9328; AVX2-FCP-NEXT: vpbroadcastd 116(%r9), %ymm4 9329; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] 9330; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9331; AVX2-FCP-NEXT: vpbroadcastd 128(%rcx), %xmm0 9332; AVX2-FCP-NEXT: vpbroadcastd 128(%rdx), %xmm4 9333; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] 9334; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9335; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload 9336; AVX2-FCP-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] 9337; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 9338; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] 9339; AVX2-FCP-NEXT: vpbroadcastq %xmm5, %ymm4 9340; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] 9341; AVX2-FCP-NEXT: vpbroadcastd 128(%r9), %ymm4 9342; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] 9343; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9344; AVX2-FCP-NEXT: vmovdqa 128(%rdx), %ymm12 9345; AVX2-FCP-NEXT: vmovdqa 128(%rcx), %ymm9 9346; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6] 9347; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,2,3,5,5,6,7] 
9348; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] 9349; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 9350; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 9351; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9352; AVX2-FCP-NEXT: vmovdqa 128(%rsi), %ymm3 9353; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9354; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] 9355; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9356; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 9357; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero 9358; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] 9359; AVX2-FCP-NEXT: vpbroadcastd 148(%r9), %ymm4 9360; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] 9361; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9362; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9363; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 9364; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] 9365; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9366; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload 9367; AVX2-FCP-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] 9368; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 9369; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] 9370; AVX2-FCP-NEXT: vpbroadcastq %xmm6, %ymm4 9371; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] 9372; AVX2-FCP-NEXT: vpbroadcastd 160(%r9), %ymm4 9373; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] 9374; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9375; AVX2-FCP-NEXT: vmovdqa 160(%rdx), %ymm10 9376; AVX2-FCP-NEXT: vmovdqa 160(%rcx), %ymm7 9377; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,1,2,2,4,5,6,6] 9378; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7] 9379; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] 9380; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 9381; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 9382; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9383; AVX2-FCP-NEXT: vmovdqa 160(%rsi), %ymm3 9384; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9385; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] 9386; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9387; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 9388; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero 9389; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] 9390; AVX2-FCP-NEXT: vpbroadcastd 180(%r9), %ymm4 9391; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] 9392; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9393; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 9394; AVX2-FCP-NEXT: vpbroadcastd %xmm11, %xmm0 9395; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 9396; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm4[0],xmm0[0],xmm4[1],xmm0[1] 9397; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9398; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload 9399; AVX2-FCP-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] 9400; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] 9401; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] 9402; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload 9403; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] 9404; AVX2-FCP-NEXT: vpbroadcastd 192(%r9), %ymm4 9405; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] 9406; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9407; AVX2-FCP-NEXT: vmovdqa 192(%rdx), %ymm1 9408; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9409; AVX2-FCP-NEXT: vmovdqa 192(%rcx), %ymm0 9410; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9411; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] 9412; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] 9413; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] 9414; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 9415; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 9416; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9417; AVX2-FCP-NEXT: vmovdqa 192(%rsi), %ymm3 9418; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9419; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] 9420; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9421; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 9422; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero 9423; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] 9424; AVX2-FCP-NEXT: vpbroadcastd 212(%r9), %ymm4 9425; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] 9426; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9427; AVX2-FCP-NEXT: vbroadcastss 224(%rcx), %xmm0 9428; AVX2-FCP-NEXT: vbroadcastss 224(%rdx), %xmm4 9429; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] 9430; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9431; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload 9432; AVX2-FCP-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] 9433; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] 9434; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] 9435; AVX2-FCP-NEXT: vbroadcastsd (%rsp), %ymm4 # 16-byte Folded Reload 9436; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] 9437; AVX2-FCP-NEXT: vbroadcastss 224(%r9), %ymm4 9438; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] 9439; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9440; AVX2-FCP-NEXT: vmovdqa 224(%rdx), %ymm1 9441; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9442; AVX2-FCP-NEXT: vmovdqa 224(%rcx), %ymm8 9443; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[0,1,2,2,4,5,6,6] 9444; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] 9445; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] 9446; AVX2-FCP-NEXT: vpermq 
{{.*#+}} ymm0 = ymm0[2,1,2,3] 9447; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 9448; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9449; AVX2-FCP-NEXT: vmovdqa 224(%rsi), %ymm3 9450; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9451; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] 9452; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill 9453; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 9454; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero 9455; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] 9456; AVX2-FCP-NEXT: vpbroadcastd 244(%r9), %ymm4 9457; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] 9458; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9459; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9460; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 9461; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 9462; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 9463; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9464; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm4 9465; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0 9466; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] 9467; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] 9468; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm15 9469; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5],ymm15[6,7] 9470; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm15 9471; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm3 9472; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5,6],ymm3[7] 9473; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9474; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 9475; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 9476; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] 9477; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 9478; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[4],ymm5[4],ymm14[5],ymm5[5] 9479; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] 9480; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 9481; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] 9482; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] 9483; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm4 9484; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] 9485; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9486; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] 9487; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] 9488; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 9489; AVX2-FCP-NEXT: # ymm3 = mem[2,3],ymm3[2,3] 9490; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] 9491; AVX2-FCP-NEXT: vpermd %ymm0, %ymm6, %ymm0 9492; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5],ymm0[6,7] 9493; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [4,6,2,3,4,6,6,7] 9494; AVX2-FCP-NEXT: vpermd %ymm15, %ymm5, %ymm15 9495; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] 9496; AVX2-FCP-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm15
; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm13
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm13
; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm14
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4,5,6],ymm14[7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpbroadcastd 48(%r9), %ymm14
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FCP-NEXT: vpermd %ymm15, %ymm6, %ymm14
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vpermd %ymm13, %ymm5, %ymm13
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
; AVX2-FCP-NEXT: vmovdqa 64(%r8), %ymm13
; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm14
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vmovdqa 64(%r9), %ymm14
; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm15
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpbroadcastd 80(%r9), %ymm15
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FCP-NEXT: vpermd %ymm13, %ymm6, %ymm13
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vpermd %ymm14, %ymm5, %ymm13
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm13
; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm14
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vmovdqa 96(%r9), %ymm14
; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm15
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpbroadcastd 112(%r9), %ymm15
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FCP-NEXT: vpermd %ymm13, %ymm6, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpermd %ymm14, %ymm5, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX2-FCP-NEXT: vmovdqa 128(%r8), %ymm2
; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm13
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vmovdqa 128(%r9), %ymm3
; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm13
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[4],ymm9[4],ymm12[5],ymm9[5]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpbroadcastd 144(%r9), %ymm14
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[6],ymm9[6],ymm12[7],ymm9[7]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX2-FCP-NEXT: vmovdqa 160(%r8), %ymm3
; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovdqa 160(%r9), %ymm2
; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm4
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpbroadcastd 176(%r9), %ymm4
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX2-FCP-NEXT: vmovdqa 192(%r8), %ymm2
; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovdqa 192(%r9), %ymm3
; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm10
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpbroadcastd 208(%r9), %ymm15
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = mem[2,3],ymm15[2,3]
; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm3
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm11, %ymm3
; AVX2-FCP-NEXT: vmovdqa 224(%r8), %ymm15
; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm13
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vmovdqa 224(%r9), %ymm13
; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm1
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vpbroadcastd 240(%r9), %ymm11
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vperm2i128 $19, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm8 = mem[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vpermd %ymm15, %ymm6, %ymm6
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vpermd %ymm13, %ymm5, %ymm5
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5,6],ymm5[7]
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovdqa %ymm5, 1504(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm3, 1440(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm1, 1408(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm2, 1312(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm4, 1248(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm10, 1216(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm7, 1120(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm9, 1056(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm12, 1024(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm14, 928(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 864(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 832(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 736(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 672(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 640(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 544(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1472(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1344(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1280(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1152(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1088(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 960(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 896(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 768(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 704(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 576(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 512(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1376(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1184(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 992(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 800(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 608(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax)
; AVX2-FCP-NEXT: addq $2376, %rsp # imm = 0x948
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i32_stride6_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $456, %rsp # imm = 0x1C8
; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm9
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm30
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm11
; AVX512-NEXT: vmovdqa64 (%rsi), %zmm10
; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm8
; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm4
; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm15
; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm28
; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm17
; AVX512-NEXT: vmovdqa64 (%rdx), %zmm24
; AVX512-NEXT: vmovdqa64 (%rcx), %zmm22
; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm18
; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm16
; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm13
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0
; AVX512-NEXT: vpermt2d %zmm10, %zmm29, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3
; AVX512-NEXT: vmovdqa (%rdx), %ymm14
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm24, %zmm20
; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm20
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm24, %zmm23
; AVX512-NEXT: vpermt2d %zmm22, %zmm6, %zmm23
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512-NEXT: vmovdqa64 %zmm24, %zmm5
; AVX512-NEXT: vpermt2d %zmm22, %zmm2, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25
; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm25
; AVX512-NEXT: vmovdqa64 %zmm17, %zmm26
; AVX512-NEXT: vpermt2d %zmm18, %zmm6, %zmm26
; AVX512-NEXT: vmovdqa64 %zmm17, %zmm5
; AVX512-NEXT: vpermt2d %zmm18, %zmm2, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm28, %zmm5
; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512-NEXT: vmovdqa64 %zmm28, %zmm27
; AVX512-NEXT: vpermt2d %zmm16, %zmm6, %zmm27
; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0
; AVX512-NEXT: vpermt2d %zmm16, %zmm2, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm24, %zmm19
; AVX512-NEXT: vpermt2d %zmm22, %zmm21, %zmm19
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm24
; AVX512-NEXT: vmovdqa64 %zmm17, %zmm22
; AVX512-NEXT: vpermt2d %zmm18, %zmm21, %zmm22
; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm17
; AVX512-NEXT: vmovdqa64 %zmm28, %zmm18
; AVX512-NEXT: vpermt2d %zmm16, %zmm21, %zmm18
; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm28
; AVX512-NEXT: vmovdqa 64(%rdx), %ymm1
; AVX512-NEXT: vpermi2d %zmm13, %zmm15, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermi2d %zmm13, %zmm15, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
; AVX512-NEXT: vpermi2d %zmm13, %zmm15, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermi2d %zmm13, %zmm15, %zmm21
; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm15
; AVX512-NEXT: vmovdqa 128(%rdx), %ymm0
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12]
; AVX512-NEXT: vpermt2d (%rcx), %ymm2, %ymm14
; AVX512-NEXT: movb $36, %al
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7]
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm13
; AVX512-NEXT: vpermt2d %zmm8, %zmm29, %zmm13
; AVX512-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7]
; AVX512-NEXT: vmovdqa64 %zmm9, %zmm14
; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1
; AVX512-NEXT: vpermt2d %zmm4, %zmm29, %zmm14
; AVX512-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7]
; AVX512-NEXT: vmovdqa 192(%rdx), %ymm0
; AVX512-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0
; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm12
; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7
; AVX512-NEXT: vpermi2d %zmm7, %zmm12, %zmm29
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7]
; AVX512-NEXT: vmovdqa64 (%r8), %zmm4
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 64(%r8), %zmm3
; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm13
; AVX512-NEXT: vmovdqa64 128(%r8), %zmm2
; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm14
; AVX512-NEXT: vmovdqa64 192(%r8), %zmm6
; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm29
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2d %zmm10, %zmm31, %zmm16
; AVX512-NEXT: movb $-110, %al
; AVX512-NEXT: kmovw %eax, %k2
; AVX512-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2}
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2d %zmm10, %zmm5, %zmm20
; AVX512-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2}
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm23
; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2d %zmm10, %zmm11, %zmm23
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm10
; AVX512-NEXT: vpermt2d %zmm8, %zmm31, %zmm10
; AVX512-NEXT: vmovdqa64 %zmm25, %zmm10 {%k2}
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm25
; AVX512-NEXT: vpermt2d %zmm8, %zmm5, %zmm25
; AVX512-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2}
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm26
; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15]
; AVX512-NEXT: vpermt2d %zmm8, %zmm11, %zmm26
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8
; AVX512-NEXT: vpermt2d %zmm1, %zmm31, %zmm8
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2}
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9
; AVX512-NEXT: vpermt2d %zmm1, %zmm5, %zmm9
; AVX512-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2}
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27
; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; AVX512-NEXT: vpermt2d %zmm1, %zmm11, %zmm27
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
; AVX512-NEXT: vpermi2d %zmm7, %zmm12, %zmm31
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2}
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm16
; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm10
; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm31
; AVX512-NEXT: vpermi2d %zmm7, %zmm12, %zmm5
; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2}
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm20
; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm25
; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm9
; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm5
; AVX512-NEXT: vpermi2d %zmm7, %zmm12, %zmm11
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1}
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm23
; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm26
; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm27
; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm11
; AVX512-NEXT: vmovdqa (%rdi), %ymm1
; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1
; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512-NEXT: vmovdqa 128(%rdi), %ymm1
; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm19
; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm22
; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm18
; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm21
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7]
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm24
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7]
; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm17
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7]
; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm28
; AVX512-NEXT: vmovdqa64 (%r9), %zmm2
; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15]
; AVX512-NEXT: vmovdqa64 64(%r9), %zmm4
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7]
; AVX512-NEXT: vmovdqa64 128(%r9), %zmm3
; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm15
; AVX512-NEXT: vmovdqa64 192(%r9), %zmm1
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm6
; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm13
; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm14
; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm29
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm16
; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm10
; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm8
; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm31
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm20
; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm25
; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm9
; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm5
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm23
; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm26
; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm27
; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm11
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm19
; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm22
; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm18
; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm21
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm24
; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm17
; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm28
; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm15
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 %zmm15, 1472(%rax)
; AVX512-NEXT: vmovdqa64 %zmm11, 1408(%rax)
; AVX512-NEXT: vmovdqa64 %zmm5, 1344(%rax)
; AVX512-NEXT: vmovdqa64 %zmm31, 1152(%rax)
; AVX512-NEXT: vmovdqa64 %zmm28, 1088(%rax)
; AVX512-NEXT: vmovdqa64 %zmm27, 1024(%rax)
; AVX512-NEXT: vmovdqa64 %zmm9, 960(%rax)
; AVX512-NEXT: vmovdqa64 %zmm8, 768(%rax)
; AVX512-NEXT: vmovdqa64 %zmm17, 704(%rax)
; AVX512-NEXT: vmovdqa64 %zmm26, 640(%rax)
; AVX512-NEXT: vmovdqa64 %zmm25, 576(%rax)
; AVX512-NEXT: vmovdqa64 %zmm10, 384(%rax)
; AVX512-NEXT: vmovdqa64 %zmm24, 320(%rax)
; AVX512-NEXT: vmovdqa64 %zmm23, 256(%rax)
; AVX512-NEXT: vmovdqa64 %zmm20, 192(%rax)
; AVX512-NEXT: vmovdqa64 %zmm16, (%rax)
; AVX512-NEXT: vmovdqa64 %zmm21, 1280(%rax)
; AVX512-NEXT: vmovdqa64 %zmm29, 1216(%rax)
; AVX512-NEXT: vmovdqa64 %zmm18, 896(%rax)
; AVX512-NEXT: vmovdqa64 %zmm14, 832(%rax)
; AVX512-NEXT: vmovdqa64 %zmm22, 512(%rax)
; AVX512-NEXT: vmovdqa64 %zmm13, 448(%rax)
; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rax)
; AVX512-NEXT: vmovdqa64 %zmm6, 64(%rax)
; AVX512-NEXT: addq $456, %rsp # imm = 0x1C8
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i32_stride6_vf64:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: subq $1160, %rsp # imm = 0x488
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24
; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29
; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23
; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm21
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6
; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm31
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm25
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm11, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm12, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm5
; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm7, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm26
; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm28
; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm28
; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm19
; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm19
; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm10
; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm8
; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm11
; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm12
; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm30, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm23
; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm18
; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7
; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm5
; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm21
; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm22
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm14
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm24
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm14
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm30
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm30
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm10
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm10
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm6
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm21
; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm13
; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm15
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm12
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm3
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm11
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm9
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm9
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13
; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm24
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm29
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
; AVX512-FCP-NEXT: movb $-110, %al
; AVX512-FCP-NEXT: kmovw %eax, %k2
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512-FCP-NEXT: movb $36, %al
; AVX512-FCP-NEXT: kmovw %eax, %k1
; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm0
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16
; AVX512-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17
; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27
; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19
; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm27
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm22
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm30
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm16
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm19
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm10
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm14
; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm17
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm21
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2}
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm15
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm3
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm26
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm28
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm13
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2}
; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm0
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1}
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm2
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1}
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm29
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2}
; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm2
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1}
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5
; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm2
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm22
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm30
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm19
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm17
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm21
; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm2
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm16
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm15
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm3
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm26
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm28
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm13
; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm2
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm9
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm10
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm29
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm12
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm14
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 1408(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 1344(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 1280(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 1216(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 1152(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 960(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 896(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 832(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 768(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 704(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 576(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 512(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 448(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 384(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 256(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 192(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 128(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rax)
; AVX512-FCP-NEXT: addq $1160, %rsp # imm = 0x488
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i32_stride6_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: subq $456, %rsp # imm = 0x1C8
; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm9
; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm30
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm11
; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm10
; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm8
; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm4
; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm15
; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm28
; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm17
; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm24
; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm22
; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm18
; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm16
; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm13
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0
; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm29, %zmm0
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3
; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm14
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm20
; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm20
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm23
; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm6, %zmm23
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm5
; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm2, %zmm5
; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm25
; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm25
; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm26
; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm6, %zmm26
; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm5
; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm2, %zmm5
; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm5
; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm27
; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm6, %zmm27
; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0
; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm2, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm19
; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm21, %zmm19
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm24
; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm22
; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm21, %zmm22
; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm17
; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm18
; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm21, %zmm18
; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm28
; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm1
; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm15, %zmm5
; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm15, %zmm6
; AVX512DQ-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm15, %zmm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm15, %zmm21
; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm15
; AVX512DQ-NEXT: vmovdqa 128(%rdx), %ymm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12]
; AVX512DQ-NEXT: vpermt2d (%rcx), %ymm2, %ymm14
; AVX512DQ-NEXT: movb $36, %al
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm13
; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm29, %zmm13
; AVX512DQ-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7]
; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm14
; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm29, %zmm14
; AVX512DQ-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7]
; AVX512DQ-NEXT: vmovdqa 192(%rdx), %ymm0
; AVX512DQ-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0
; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm12
; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm7
; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm12, %zmm29
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7]
; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm3
; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm3
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm13
; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm2
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm14
; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm6
; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm29
; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm16
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm31, %zmm16
; AVX512DQ-NEXT: movb $-110, %al
; AVX512DQ-NEXT: kmovw %eax, %k2
; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2}
; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm5, %zmm20
; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2}
; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm23
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15]
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm11, %zmm23
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm10
; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm31, %zmm10
; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm10 {%k2}
; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm25
; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm5, %zmm25
; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2}
; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm26
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15]
; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm11, %zmm26
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0
; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm31, %zmm8
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2}
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm5, %zmm9
; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2}
; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm11, %zmm27
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm12, %zmm31
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2}
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm16
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm10
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm31
; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm12, %zmm5
; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2}
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm20
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm25
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm9
; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm5
; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm12, %zmm11
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1}
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm23
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm26
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm27
; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm11
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm1
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm19
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm22
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm18
; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm21
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7]
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm24
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7]
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm17
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7]
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm28
; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm2
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15]
; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm4
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7]
; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm3
; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm15
; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm6
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm13
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm14
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm29
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm16
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm10
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm8
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm31
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm20
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm25
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm9
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm5
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm23
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm26
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm27
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm11
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm19
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm22
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm18
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm21
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm24
; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm17
; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm28
; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm15
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1472(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 1408(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1344(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1152(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm28, 1088(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm27, 1024(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 960(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm17, 704(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm26, 640(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm25, 576(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 384(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm24, 320(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm23, 256(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm20, 192(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1280(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1216(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm14, 832(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm22, 512(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 448(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm19, 128(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rax)
; AVX512DQ-NEXT: addq $456, %rsp # imm = 0x1C8
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i32_stride6_vf64:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: subq $1160, %rsp # imm = 0x488
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
;
AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 10654; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 10655; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 10656; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 10657; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 10658; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 10659; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 10660; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] 10661; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] 10662; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 10663; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 10664; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10665; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] 10666; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 10667; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 10668; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 10669; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 10670; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] 10671; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] 10672; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 10673; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 10674; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 10675; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 10676; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] 10677; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 10678; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 10679; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 10680; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 10681; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 10682; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] 10683; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 10684; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 10685; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 10686; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 10687; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] 10688; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] 10689; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 10690; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 10691; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10692; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] 10693; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 10694; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 10695; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 10696; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 10697; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10698; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 10699; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 10700; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10701; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 10702; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 10703; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10704; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 10705; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 10706; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10707; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 10708; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm12, 
%zmm0 10709; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10710; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 10711; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10712; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 10713; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 10714; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 10715; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10716; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 10717; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 10718; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10719; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 10720; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 10721; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10722; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 10723; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 10724; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 10725; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 10726; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 10727; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10728; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 10729; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 10730; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 10731; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10732; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 10733; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10734; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 10735; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10736; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 10737; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10738; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 10739; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10740; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 10741; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10742; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] 10743; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] 10744; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 10745; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 10746; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 10747; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] 10748; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] 10749; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 10750; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 10751; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] 10752; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 10753; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 10754; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] 10755; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 10756; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 10757; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm21 10758; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 10759; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 10760; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 10761; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 10762; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 10763; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 10764; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm21, %zmm30 10765; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 10766; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 10767; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 10768; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 10769; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 10770; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 10771; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm13 10772; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 10773; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 10774; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 10775; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 10776; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 10777; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 10778; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 10779; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 10780; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 10781; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 10782; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 10783; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 10784; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm1 10785; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 10786; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 10787; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm24 10788; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 10789; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 10790; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 10791; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 10792; AVX512DQ-FCP-NEXT: movb $-110, %al 10793; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 10794; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10795; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} 10796; AVX512DQ-FCP-NEXT: movb $36, %al 10797; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 10798; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} 10799; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} 10800; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} 10801; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload 10802; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} 10803; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} 10804; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm0 10805; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] 10806; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 10807; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill 10808; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] 10809; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 10810; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10811; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] 10812; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 10813; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] 10814; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 10815; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10816; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] 10817; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 10818; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10819; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] 10820; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 10821; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 10822; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm22, %zmm27 {%k2} 10823; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload 10824; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} 10825; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10826; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} 10827; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 10828; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} 10829; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 10830; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} 10831; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10832; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} 10833; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 10834; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 10835; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 10836; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 10837; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 10838; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 10839; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm10 10840; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 10841; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 10842; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 10843; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 10844; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} 10845; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload 10846; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} 10847; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10848; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} 10849; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2} 10850; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1} 10851; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10852; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} 10853; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 10854; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 10855; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 10856; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 10857; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 10858; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm28 10859; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm13 10860; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 10861; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} 10862; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm0 10863; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 10864; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 10865; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 10866; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} 10867; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 10868; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 10869; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 10870; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} 10871; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm29 10872; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 10873; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} 10874; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm2 10875; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 10876; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 10877; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] 10878; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload 10879; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 10880; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 10881; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} 10882; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] 10883; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 10884; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 10885; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 10886; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 10887; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] 10888; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 10889; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 10890; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} 10891; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] 10892; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload 10893; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 10894; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 10895; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] 10896; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 10897; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 10898; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] 10899; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 10900; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 10901; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 10902; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 10903; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 10904; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 10905; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 10906; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 10907; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm2 10908; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 10909; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 10910; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 10911; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 10912; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 10913; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 10914; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 10915; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm9 10916; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 10917; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 10918; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 10919; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 10920; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 10921; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 10922; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) 10923; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 1408(%rax) 10924; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 1344(%rax) 10925; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 1280(%rax) 10926; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 1216(%rax) 10927; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 1152(%rax) 10928; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) 10929; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) 10930; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 960(%rax) 10931; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 896(%rax) 10932; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 832(%rax) 10933; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 768(%rax) 10934; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 704(%rax) 10935; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax) 10936; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 576(%rax) 10937; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 512(%rax) 10938; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 448(%rax) 10939; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 384(%rax) 10940; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) 10941; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 256(%rax) 10942; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 192(%rax) 10943; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 128(%rax) 10944; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) 10945; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rax) 10946; AVX512DQ-FCP-NEXT: addq $1160, %rsp # imm = 0x488 10947; AVX512DQ-FCP-NEXT: vzeroupper 10948; AVX512DQ-FCP-NEXT: retq 10949; 10950; AVX512BW-LABEL: store_i32_stride6_vf64: 10951; AVX512BW: # %bb.0: 10952; AVX512BW-NEXT: subq $456, %rsp # imm = 0x1C8 10953; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 10954; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm30 10955; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 10956; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm10 10957; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm8 10958; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm4 10959; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm15 10960; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm28 10961; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm17 10962; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm24 10963; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm22 10964; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm18 10965; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm16 10966; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm13 10967; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] 10968; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] 10969; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 10970; AVX512BW-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 10971; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 10972; AVX512BW-NEXT: vmovdqa (%rdx), %ymm14 10973; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] 10974; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 10975; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 10976; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 10977; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] 10978; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 10979; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 10980; AVX512BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm23 10981; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] 10982; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 10983; AVX512BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 10984; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10985; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 10986; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 10987; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 10988; AVX512BW-NEXT: vpermt2d %zmm18, %zmm6, %zmm26 10989; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 10990; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 10991; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10992; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm5 10993; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 10994; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10995; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 10996; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm27 10997; AVX512BW-NEXT: vpermt2d %zmm16, %zmm6, %zmm27 10998; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 10999; AVX512BW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 11000; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11001; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] 11002; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 11003; 
AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm19 11004; AVX512BW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 11005; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] 11006; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 11007; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 11008; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 11009; AVX512BW-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 11010; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm17 11011; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm18 11012; AVX512BW-NEXT: vpermt2d %zmm16, %zmm21, %zmm18 11013; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm28 11014; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm1 11015; AVX512BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm5 11016; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11017; AVX512BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm6 11018; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill 11019; AVX512BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm2 11020; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11021; AVX512BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 11022; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 11023; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm0 11024; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] 11025; AVX512BW-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 11026; AVX512BW-NEXT: movb $36, %al 11027; AVX512BW-NEXT: kmovd %eax, %k1 11028; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7] 11029; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 11030; AVX512BW-NEXT: vpermt2d %zmm8, %zmm29, %zmm13 11031; AVX512BW-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1 11032; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7] 11033; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 11034; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 11035; AVX512BW-NEXT: vpermt2d %zmm4, %zmm29, %zmm14 11036; AVX512BW-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0 11037; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7] 11038; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm0 11039; AVX512BW-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0 11040; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 11041; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm7 11042; AVX512BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 11043; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] 11044; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 11045; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] 11046; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 11047; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11048; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm3 11049; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 11050; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 11051; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 11052; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm6 11053; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 11054; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 11055; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] 11056; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 11057; AVX512BW-NEXT: vpermt2d %zmm10, %zmm31, %zmm16 11058; AVX512BW-NEXT: movb $-110, %al 11059; AVX512BW-NEXT: kmovd %eax, %k2 11060; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} 11061; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 11062; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] 11063; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 11064; AVX512BW-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 11065; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} 11066; 
AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 11067; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15] 11068; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11069; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] 11070; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 11071; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 11072; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11073; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} 11074; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm10 11075; AVX512BW-NEXT: vpermt2d %zmm8, %zmm31, %zmm10 11076; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm10 {%k2} 11077; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm25 11078; AVX512BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm25 11079; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2} 11080; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm26 11081; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15] 11082; AVX512BW-NEXT: vpermt2d %zmm8, %zmm11, %zmm26 11083; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11084; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} 11085; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 11086; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 11087; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm8 11088; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 11089; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} 11090; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 11091; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm9 11092; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2} 11093; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 11094; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] 11095; AVX512BW-NEXT: vpermt2d %zmm1, %zmm11, %zmm27 11096; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 11097; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} 11098; AVX512BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 11099; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 11100; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} 11101; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] 11102; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 11103; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 11104; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 11105; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm31 11106; AVX512BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 11107; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload 11108; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} 11109; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] 11110; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 11111; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 11112; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 11113; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 11114; AVX512BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 11115; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 11116; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} 11117; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] 11118; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 11119; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 
11120; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 11121; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 11122; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 11123; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 11124; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3] 11125; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm1 11126; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 11127; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3] 11128; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1 11129; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 11130; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3] 11131; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm1 11132; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 11133; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] 11134; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] 11135; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 11136; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 11137; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 11138; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 11139; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 11140; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] 11141; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] 11142; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 11143; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] 11144; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 11145; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7] 11146; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm28 11147; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2 11148; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15] 11149; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm4 11150; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7] 11151; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm3 11152; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 11153; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm1 11154; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] 11155; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 11156; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 11157; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 11158; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 11159; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 11160; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] 11161; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 11162; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 11163; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 11164; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 11165; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] 11166; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 11167; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 11168; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 11169; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 11170; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] 11171; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 11172; AVX512BW-NEXT: vpermt2d %zmm4, 
%zmm0, %zmm26 11173; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 11174; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 11175; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] 11176; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 11177; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 11178; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 11179; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 11180; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] 11181; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 11182; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 11183; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 11184; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm15 11185; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 11186; AVX512BW-NEXT: vmovdqa64 %zmm15, 1472(%rax) 11187; AVX512BW-NEXT: vmovdqa64 %zmm11, 1408(%rax) 11188; AVX512BW-NEXT: vmovdqa64 %zmm5, 1344(%rax) 11189; AVX512BW-NEXT: vmovdqa64 %zmm31, 1152(%rax) 11190; AVX512BW-NEXT: vmovdqa64 %zmm28, 1088(%rax) 11191; AVX512BW-NEXT: vmovdqa64 %zmm27, 1024(%rax) 11192; AVX512BW-NEXT: vmovdqa64 %zmm9, 960(%rax) 11193; AVX512BW-NEXT: vmovdqa64 %zmm8, 768(%rax) 11194; AVX512BW-NEXT: vmovdqa64 %zmm17, 704(%rax) 11195; AVX512BW-NEXT: vmovdqa64 %zmm26, 640(%rax) 11196; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%rax) 11197; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rax) 11198; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%rax) 11199; AVX512BW-NEXT: vmovdqa64 %zmm23, 256(%rax) 11200; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%rax) 11201; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rax) 11202; AVX512BW-NEXT: vmovdqa64 %zmm21, 1280(%rax) 11203; AVX512BW-NEXT: vmovdqa64 %zmm29, 1216(%rax) 11204; AVX512BW-NEXT: vmovdqa64 %zmm18, 896(%rax) 11205; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rax) 11206; AVX512BW-NEXT: vmovdqa64 %zmm22, 512(%rax) 11207; AVX512BW-NEXT: vmovdqa64 %zmm13, 448(%rax) 11208; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rax) 11209; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rax) 11210; AVX512BW-NEXT: addq $456, %rsp # imm = 0x1C8 11211; AVX512BW-NEXT: vzeroupper 11212; AVX512BW-NEXT: retq 11213; 11214; AVX512BW-FCP-LABEL: store_i32_stride6_vf64: 11215; AVX512BW-FCP: # %bb.0: 11216; AVX512BW-FCP-NEXT: subq $1160, %rsp # imm = 0x488 11217; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 11218; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 11219; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 11220; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 11221; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 11222; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 11223; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 11224; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 11225; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 11226; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 11227; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] 11228; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] 11229; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 11230; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 11231; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11232; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] 11233; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 11234; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 11235; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 11236; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 11237; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] 11238; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] 11239; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 11240; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 11241; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 11242; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 11243; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] 11244; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 11245; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 11246; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 11247; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 11248; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 11249; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] 11250; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 11251; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 11252; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 11253; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 11254; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] 11255; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] 11256; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 11257; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 11258; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11259; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] 11260; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 11261; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 11262; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 11263; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 11264; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11265; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 11266; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 11267; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11268; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 11269; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 11270; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11271; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 11272; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 11273; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11274; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 11275; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 11276; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11277; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 11278; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11279; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 11280; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 11281; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 11282; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11283; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 11284; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 11285; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11286; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 11287; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 11288; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11289; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 11290; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 11291; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 11292; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 11293; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 11294; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11295; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 11296; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 11297; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 11298; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11299; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 11300; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11301; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 11302; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11303; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 11304; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11305; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 11306; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11307; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 11308; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11309; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] 11310; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] 11311; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 11312; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 11313; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 11314; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] 11315; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] 11316; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 11317; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 11318; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] 11319; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 11320; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 11321; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] 11322; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 11323; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 11324; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm21 11325; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 11326; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 11327; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 11328; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 11329; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 11330; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 11331; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 11332; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 11333; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 11334; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 11335; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 11336; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 11337; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 11338; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm13 11339; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 11340; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 11341; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 11342; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 11343; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 11344; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 11345; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 11346; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 11347; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 11348; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 11349; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 11350; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 11351; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm1 11352; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 11353; 
AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 11354; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm24 11355; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 11356; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 11357; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 11358; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 11359; AVX512BW-FCP-NEXT: movb $-110, %al 11360; AVX512BW-FCP-NEXT: kmovd %eax, %k2 11361; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11362; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} 11363; AVX512BW-FCP-NEXT: movb $36, %al 11364; AVX512BW-FCP-NEXT: kmovd %eax, %k1 11365; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} 11366; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} 11367; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} 11368; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload 11369; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} 11370; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} 11371; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 11372; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] 11373; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 11374; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill 11375; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] 11376; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 11377; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11378; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] 11379; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 11380; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] 11381; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 11382; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11383; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] 11384; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 11385; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11386; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] 11387; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 11388; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 11389; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} 11390; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload 11391; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} 11392; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11393; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} 11394; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 11395; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} 11396; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 11397; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} 11398; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11399; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} 11400; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 11401; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 11402; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 11403; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 11404; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 11405; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 11406; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm10 11407; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 
%zmm14 11408; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 11409; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 11410; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 11411; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} 11412; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload 11413; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} 11414; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11415; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} 11416; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2} 11417; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1} 11418; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11419; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} 11420; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 11421; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 11422; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 11423; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 11424; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 11425; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm28 11426; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm13 11427; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 11428; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} 11429; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm0 11430; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 11431; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 11432; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 11433; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} 11434; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 11435; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 11436; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 11437; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} 11438; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm29 11439; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 11440; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} 11441; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 11442; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 11443; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 11444; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] 11445; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload 11446; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 11447; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 11448; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} 11449; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] 11450; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 11451; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 11452; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 11453; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 11454; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] 11455; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 11456; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 11457; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} 11458; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] 11459; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload 11460; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 11461; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 11462; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 
= [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] 11463; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 11464; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 11465; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] 11466; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 11467; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 11468; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 11469; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 11470; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 11471; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 11472; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 11473; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 11474; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm2 11475; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 11476; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 11477; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 11478; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 11479; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 11480; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 11481; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 11482; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm9 11483; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 11484; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 11485; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 11486; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 11487; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 11488; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 11489; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) 11490; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 1408(%rax) 11491; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 1344(%rax) 11492; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 1280(%rax) 11493; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 1216(%rax) 11494; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 1152(%rax) 11495; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) 11496; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) 11497; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 960(%rax) 11498; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 896(%rax) 11499; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 832(%rax) 11500; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 768(%rax) 11501; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 704(%rax) 11502; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax) 11503; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 576(%rax) 11504; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 512(%rax) 11505; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 448(%rax) 11506; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 384(%rax) 11507; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) 11508; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 256(%rax) 11509; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%rax) 11510; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 128(%rax) 11511; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) 11512; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rax) 11513; AVX512BW-FCP-NEXT: addq $1160, %rsp # imm = 0x488 11514; AVX512BW-FCP-NEXT: vzeroupper 11515; AVX512BW-FCP-NEXT: retq 11516; 11517; AVX512DQ-BW-LABEL: store_i32_stride6_vf64: 11518; AVX512DQ-BW: # %bb.0: 11519; AVX512DQ-BW-NEXT: subq $456, %rsp # imm = 0x1C8 11520; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm9 11521; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm30 11522; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm11 11523; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm10 11524; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm8 11525; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm4 11526; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm15 11527; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm28 11528; AVX512DQ-BW-NEXT: 
vmovdqa64 64(%rdx), %zmm17 11529; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm24 11530; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm22 11531; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm18 11532; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm16 11533; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm13 11534; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] 11535; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] 11536; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 11537; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 11538; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 11539; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm14 11540; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] 11541; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 11542; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm20 11543; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 11544; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] 11545; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 11546; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm23 11547; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm23 11548; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] 11549; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm5 11550; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 11551; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11552; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm25 11553; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 11554; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm26 11555; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm6, %zmm26 11556; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm5 11557; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 11558; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11559; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm5 11560; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 11561; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11562; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 11563; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm27 11564; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm6, %zmm27 11565; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 11566; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 11567; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11568; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] 11569; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 11570; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm19 11571; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 11572; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] 11573; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] 11574; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 11575; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm22 11576; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 11577; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm17 11578; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm18 11579; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm21, %zmm18 11580; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm28 11581; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %ymm1 11582; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm5 11583; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11584; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm6 11585; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill 11586; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm2 11587; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11588; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 11589; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 11590; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %ymm0 11591; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] 11592; AVX512DQ-BW-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 11593; AVX512DQ-BW-NEXT: movb $36, %al 11594; AVX512DQ-BW-NEXT: kmovd %eax, %k1 11595; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7] 11596; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm13 11597; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm29, %zmm13 11598; AVX512DQ-BW-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1 11599; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7] 11600; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm14 11601; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 11602; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm29, %zmm14 11603; AVX512DQ-BW-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0 11604; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7] 11605; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %ymm0 11606; AVX512DQ-BW-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0 11607; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm12 11608; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm7 11609; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 11610; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] 11611; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4 11612; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] 11613; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 11614; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11615; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm3 11616; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 11617; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm2 11618; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 11619; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm6 11620; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 11621; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm16 11622; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] 11623; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] 11624; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm31, %zmm16 11625; AVX512DQ-BW-NEXT: movb $-110, %al 11626; AVX512DQ-BW-NEXT: kmovd %eax, %k2 11627; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} 11628; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20 11629; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] 11630; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 11631; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 11632; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} 11633; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm23 11634; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15] 11635; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11636; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] 11637; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 11638; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 11639; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11640; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} 11641; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm10 11642; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm31, %zmm10 11643; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm25, %zmm10 {%k2} 11644; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm25 11645; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm25 11646; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2} 11647; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm26 11648; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15] 11649; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm11, %zmm26 11650; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11651; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} 11652; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 11653; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8 11654; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm8 11655; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 11656; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} 11657; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 11658; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm9 11659; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2} 11660; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 11661; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] 11662; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm11, %zmm27 11663; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 11664; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} 11665; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 11666; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 11667; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} 11668; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] 11669; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 11670; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 11671; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 11672; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm31 11673; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 11674; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload 11675; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} 11676; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] 11677; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 11678; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 11679; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 11680; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 11681; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 11682; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 11683; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} 11684; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] 11685; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 11686; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 11687; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 11688; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 11689; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 11690; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 11691; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3] 11692; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm1 11693; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 11694; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3] 11695; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm1 11696; 
AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 11697; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3] 11698; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm1 11699; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] 11700; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] 11701; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] 11702; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 11703; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 11704; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 11705; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 11706; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 11707; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] 11708; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] 11709; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 11710; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] 11711; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 11712; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7] 11713; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm28 11714; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm2 11715; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15] 11716; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm4 11717; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7] 11718; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm3 11719; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 11720; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm1 11721; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] 11722; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 11723; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 11724; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 11725; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 11726; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 11727; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] 11728; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 11729; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 11730; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 11731; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 11732; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] 11733; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 11734; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 11735; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 11736; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 11737; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] 11738; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 11739; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 11740; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 11741; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 11742; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] 11743; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 11744; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 11745; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 11746; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 11747; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = 
[0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] 11748; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 11749; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 11750; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 11751; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm15 11752; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 11753; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 1472(%rax) 11754; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 1408(%rax) 11755; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 1344(%rax) 11756; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 1152(%rax) 11757; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 1088(%rax) 11758; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 1024(%rax) 11759; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 960(%rax) 11760; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 768(%rax) 11761; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 704(%rax) 11762; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 640(%rax) 11763; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 576(%rax) 11764; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 384(%rax) 11765; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 320(%rax) 11766; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 256(%rax) 11767; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 192(%rax) 11768; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rax) 11769; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 1280(%rax) 11770; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 1216(%rax) 11771; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 896(%rax) 11772; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 832(%rax) 11773; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 512(%rax) 11774; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 448(%rax) 11775; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rax) 11776; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 64(%rax) 11777; AVX512DQ-BW-NEXT: addq $456, %rsp # imm = 0x1C8 11778; AVX512DQ-BW-NEXT: vzeroupper 11779; AVX512DQ-BW-NEXT: retq 11780; 11781; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf64: 11782; AVX512DQ-BW-FCP: # %bb.0: 11783; AVX512DQ-BW-FCP-NEXT: subq $1160, %rsp # imm = 0x488 11784; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 11785; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 11786; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 11787; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 11788; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 11789; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 11790; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 11791; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 11792; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 11793; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 11794; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] 11795; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] 11796; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 11797; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 11798; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11799; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] 11800; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 11801; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 11802; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 11803; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 11804; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] 11805; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] 11806; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 11807; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 11808; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 11809; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 11810; AVX512DQ-BW-FCP-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] 11811; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 11812; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 11813; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 11814; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 11815; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 11816; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] 11817; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 11818; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 11819; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 11820; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 11821; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] 11822; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] 11823; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 11824; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 11825; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11826; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] 11827; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 11828; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 11829; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 11830; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 11831; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11832; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 11833; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 11834; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11835; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 11836; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 11837; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11838; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 11839; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 11840; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11841; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 11842; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 11843; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11844; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 11845; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11846; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 11847; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 11848; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 11849; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11850; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 11851; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 11852; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11853; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 11854; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 11855; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11856; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 11857; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 11858; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 11859; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 11860; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 11861; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11862; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 11863; AVX512DQ-BW-FCP-NEXT: 
vpermt2d %zmm21, %zmm9, %zmm19 11864; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 11865; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11866; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 11867; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11868; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 11869; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11870; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 11871; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11872; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 11873; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11874; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 11875; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11876; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] 11877; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] 11878; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 11879; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 11880; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 11881; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] 11882; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] 11883; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 11884; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 11885; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] 11886; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 11887; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 11888; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] 11889; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 11890; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 11891; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm21 11892; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 11893; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 11894; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 11895; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 11896; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 11897; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 11898; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 11899; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 11900; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 11901; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 11902; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 11903; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 11904; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 11905; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm13 11906; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 11907; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 11908; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 11909; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 11910; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 11911; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 11912; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 11913; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 11914; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 11915; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 11916; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 11917; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 11918; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm1 11919; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 11920; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 11921; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm24 11922; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 11923; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 11924; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 11925; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 11926; AVX512DQ-BW-FCP-NEXT: movb $-110, %al 11927; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 11928; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11929; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} 11930; AVX512DQ-BW-FCP-NEXT: movb $36, %al 11931; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 11932; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} 11933; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} 11934; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} 11935; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload 11936; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} 11937; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} 11938; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 11939; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] 11940; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 11941; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill 11942; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] 11943; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 11944; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11945; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] 11946; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 11947; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] 11948; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 11949; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11950; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] 11951; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 11952; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11953; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] 11954; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 11955; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 11956; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} 11957; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload 11958; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} 11959; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11960; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} 11961; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 11962; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} 11963; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 11964; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} 11965; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11966; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} 11967; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 11968; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 11969; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 11970; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 
11971; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 11972; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 11973; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm10 11974; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 11975; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 11976; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 11977; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 11978; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} 11979; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload 11980; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} 11981; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11982; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} 11983; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2} 11984; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1} 11985; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11986; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} 11987; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 11988; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 11989; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 11990; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 11991; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 11992; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm28 11993; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm13 11994; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 11995; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} 11996; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm0 11997; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 11998; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 11999; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 12000; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} 12001; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 12002; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 12003; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 12004; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} 12005; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm29 12006; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 12007; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} 12008; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 12009; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 12010; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 12011; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] 12012; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload 12013; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 12014; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 12015; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} 12016; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] 12017; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 12018; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 12019; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 12020; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 12021; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] 12022; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 12023; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 12024; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm7, %zmm1 {%k1} 12025; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] 12026; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload 12027; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 12028; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 12029; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] 12030; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 12031; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 12032; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] 12033; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 12034; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 12035; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 12036; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 12037; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 12038; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 12039; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 12040; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 12041; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm2 12042; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 12043; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 12044; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 12045; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 12046; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 12047; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 12048; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 12049; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm9 12050; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 12051; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 12052; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 12053; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 12054; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 12055; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 12056; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) 12057; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 1408(%rax) 12058; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 1344(%rax) 12059; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 1280(%rax) 12060; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 1216(%rax) 12061; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 1152(%rax) 12062; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) 12063; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) 12064; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 960(%rax) 12065; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 896(%rax) 12066; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 832(%rax) 12067; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 768(%rax) 12068; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 704(%rax) 12069; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax) 12070; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 576(%rax) 12071; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 512(%rax) 12072; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 448(%rax) 12073; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 384(%rax) 12074; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) 12075; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 256(%rax) 12076; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%rax) 12077; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 128(%rax) 12078; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) 12079; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rax) 12080; AVX512DQ-BW-FCP-NEXT: addq $1160, %rsp # imm = 0x488 12081; AVX512DQ-BW-FCP-NEXT: vzeroupper 12082; AVX512DQ-BW-FCP-NEXT: retq 12083 %in.vec0 = load <64 x i32>, ptr 
%in.vecptr0, align 64 12084 %in.vec1 = load <64 x i32>, ptr %in.vecptr1, align 64 12085 %in.vec2 = load <64 x i32>, ptr %in.vecptr2, align 64 12086 %in.vec3 = load <64 x i32>, ptr %in.vecptr3, align 64 12087 %in.vec4 = load <64 x i32>, ptr %in.vecptr4, align 64 12088 %in.vec5 = load <64 x i32>, ptr %in.vecptr5, align 64 12089 %1 = shufflevector <64 x i32> %in.vec0, <64 x i32> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 12090 %2 = shufflevector <64 x i32> %in.vec2, <64 x i32> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 12091 %3 = shufflevector <64 x i32> %in.vec4, <64 x i32> %in.vec5, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, 
i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 12092 %4 = shufflevector <128 x i32> %1, <128 x i32> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> 12093 %5 = shufflevector <128 x i32> %3, <128 x i32> poison, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 12094 %6 = shufflevector <256 x i32> %4, <256 x i32> %5, <384 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, 
i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319, i32 320, i32 321, i32 322, i32 323, i32 324, i32 325, i32 326, i32 327, i32 328, i32 329, i32 330, i32 331, i32 332, i32 333, i32 334, i32 335, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 360, i32 361, i32 362, i32 363, i32 364, i32 365, i32 366, i32 367, i32 368, i32 369, i32 370, i32 371, i32 372, i32 373, i32 374, i32 375, i32 376, i32 377, i32 378, i32 379, i32 380, i32 381, i32 382, i32 383> 12095 %interleaved.vec = shufflevector <384 x i32> %6, <384 x i32> poison, <384 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 320, i32 1, i32 65, i32 129, i32 193, i32 257, i32 321, i32 2, i32 66, i32 130, i32 194, i32 258, i32 322, i32 3, i32 67, i32 131, i32 195, i32 259, i32 323, i32 4, i32 68, i32 132, i32 196, i32 260, i32 324, i32 5, i32 69, i32 133, i32 197, i32 261, i32 325, i32 6, i32 70, i32 134, i32 198, i32 262, i32 326, i32 7, i32 71, i32 135, i32 199, i32 263, i32 327, i32 8, i32 72, i32 136, i32 200, i32 264, i32 328, i32 9, i32 73, i32 137, i32 201, i32 265, i32 329, i32 10, i32 74, i32 138, i32 202, i32 266, i32 330, i32 11, i32 75, i32 139, i32 203, i32 267, i32 331, i32 12, i32 76, i32 140, i32 204, i32 268, i32 332, i32 13, i32 77, i32 141, i32 205, i32 269, i32 333, i32 14, i32 78, i32 142, i32 206, i32 270, i32 334, i32 15, i32 79, i32 143, i32 207, i32 271, i32 335, i32 16, i32 80, i32 144, i32 208, i32 272, i32 336, i32 17, i32 81, i32 145, i32 209, i32 273, i32 337, i32 18, i32 82, i32 146, i32 210, i32 274, i32 338, i32 19, i32 83, i32 147, i32 211, i32 275, i32 339, i32 20, i32 84, i32 148, i32 212, i32 276, i32 340, i32 21, i32 85, i32 149, i32 213, i32 277, i32 341, i32 22, i32 86, i32 150, i32 214, i32 278, i32 342, i32 23, i32 87, i32 151, i32 215, i32 279, i32 343, i32 24, i32 88, i32 152, i32 216, i32 280, i32 344, i32 25, i32 89, i32 153, i32 217, i32 281, i32 345, i32 26, i32 90, i32 154, i32 218, i32 282, i32 346, i32 27, i32 91, i32 155, i32 219, i32 283, i32 347, i32 28, i32 92, i32 156, i32 220, i32 284, i32 348, i32 29, i32 93, i32 157, i32 221, i32 285, i32 349, i32 30, i32 94, i32 158, i32 222, i32 286, i32 350, i32 31, i32 95, i32 159, i32 223, i32 287, i32 351, i32 32, i32 96, i32 160, i32 224, i32 288, i32 352, i32 33, i32 97, i32 161, i32 225, i32 289, i32 353, i32 34, i32 98, i32 162, i32 226, i32 290, i32 354, i32 35, i32 99, i32 163, i32 227, i32 291, i32 355, i32 36, i32 100, i32 164, i32 228, i32 292, i32 356, i32 37, i32 101, i32 165, i32 229, i32 293, i32 357, i32 38, i32 102, i32 166, i32 230, i32 294, i32 358, i32 39, i32 103, i32 167, i32 231, i32 295, i32 359, i32 40, i32 104, i32 168, i32 232, i32 296, i32 360, i32 41, i32 105, i32 169, i32 233, i32 297, i32 361, i32 42, i32 106, i32 170, i32 234, i32 298, i32 362, 
i32 43, i32 107, i32 171, i32 235, i32 299, i32 363, i32 44, i32 108, i32 172, i32 236, i32 300, i32 364, i32 45, i32 109, i32 173, i32 237, i32 301, i32 365, i32 46, i32 110, i32 174, i32 238, i32 302, i32 366, i32 47, i32 111, i32 175, i32 239, i32 303, i32 367, i32 48, i32 112, i32 176, i32 240, i32 304, i32 368, i32 49, i32 113, i32 177, i32 241, i32 305, i32 369, i32 50, i32 114, i32 178, i32 242, i32 306, i32 370, i32 51, i32 115, i32 179, i32 243, i32 307, i32 371, i32 52, i32 116, i32 180, i32 244, i32 308, i32 372, i32 53, i32 117, i32 181, i32 245, i32 309, i32 373, i32 54, i32 118, i32 182, i32 246, i32 310, i32 374, i32 55, i32 119, i32 183, i32 247, i32 311, i32 375, i32 56, i32 120, i32 184, i32 248, i32 312, i32 376, i32 57, i32 121, i32 185, i32 249, i32 313, i32 377, i32 58, i32 122, i32 186, i32 250, i32 314, i32 378, i32 59, i32 123, i32 187, i32 251, i32 315, i32 379, i32 60, i32 124, i32 188, i32 252, i32 316, i32 380, i32 61, i32 125, i32 189, i32 253, i32 317, i32 381, i32 62, i32 126, i32 190, i32 254, i32 318, i32 382, i32 63, i32 127, i32 191, i32 255, i32 319, i32 383> 12096 store <384 x i32> %interleaved.vec, ptr %out.vec, align 64 12097 ret void 12098} 12099
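; Note (editorial, derived from the mask above): the shufflevector chain builds a
; single <384 x i32> concatenation of the six <64 x i32> inputs (in.vec0 occupies
; lanes [0,64), in.vec1 lanes [64,128), ..., in.vec5 lanes [320,384)), and the
; final %interleaved.vec mask (0, 64, 128, 192, 256, 320, 1, 65, ...) selects
; out[6*i+j] = in.vec<j>[i] for i in [0,64), j in [0,6) -- i.e. the row-major
; stride-6 interleaved store described at the top of this file.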