; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved stores.
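;
; As an illustration only (an assumed sketch, not the exact source of these
; tests), a scalar loop such as the following C function writes six i16
; streams at stride 6; this is the kind of loop the LoopVectorizer turns into
; the wide loads, single interleaving shufflevector, and single wide store
; seen in the IR bodies below. The name store6 and its parameters are
; hypothetical.
;
;   void store6(short *out, const short *a, const short *b, const short *c,
;               const short *d, const short *e, const short *f, int n) {
;     for (int i = 0; i < n; ++i) {
;       // one group of 6 consecutive output elements per iteration
;       out[6 * i + 0] = a[i];
;       out[6 * i + 1] = b[i];
;       out[6 * i + 2] = c[i];
;       out[6 * i + 3] = d[i];
;       out[6 * i + 4] = e[i];
;       out[6 * i + 5] = f[i];
;     }
;   }
;
; After vectorization, each input becomes one wide load, the six vectors are
; concatenated, and a single shufflevector interleaves them before one wide
; store, as in the %interleaved.vec patterns below.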
17 18define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { 19; SSE-LABEL: store_i16_stride6_vf2: 20; SSE: # %bb.0: 21; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 22; SSE-NEXT: movdqa (%rdi), %xmm0 23; SSE-NEXT: movdqa (%rdx), %xmm1 24; SSE-NEXT: movdqa (%r8), %xmm2 25; SSE-NEXT: movdqa (%r9), %xmm3 26; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 27; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 28; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 29; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 30; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,4,6] 31; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 32; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 33; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[3,3] 34; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] 35; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] 36; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] 37; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,3,3,4,5,6,7] 38; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 39; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] 40; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 41; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 42; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,0,3,4,5,6,7] 43; SSE-NEXT: movaps %xmm0, (%rax) 44; SSE-NEXT: movq %xmm1, 16(%rax) 45; SSE-NEXT: retq 46; 47; AVX-LABEL: store_i16_stride6_vf2: 48; AVX: # %bb.0: 49; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 50; AVX-NEXT: vmovdqa (%rdi), %xmm0 51; AVX-NEXT: vmovdqa (%rdx), %xmm1 52; AVX-NEXT: vmovdqa (%r8), %xmm2 53; AVX-NEXT: vmovdqa (%r9), %xmm3 54; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 55; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 56; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 57; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 58; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,5,7,6,7] 59; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] 60; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] 61; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 62; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7] 63; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 64; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 65; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7] 66; AVX-NEXT: vmovdqa %xmm0, (%rax) 67; AVX-NEXT: vmovq %xmm1, 16(%rax) 68; AVX-NEXT: retq 69; 70; AVX2-LABEL: store_i16_stride6_vf2: 71; AVX2: # %bb.0: 72; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 73; AVX2-NEXT: vmovdqa (%rdi), %xmm0 74; AVX2-NEXT: vmovdqa (%rdx), %xmm1 75; AVX2-NEXT: vmovdqa (%r8), %xmm2 76; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 77; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 78; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 79; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] 80; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 81; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] 82; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 83; AVX2-NEXT: 
vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] 84; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] 85; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] 86; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 87; AVX2-NEXT: vmovq %xmm1, 16(%rax) 88; AVX2-NEXT: vmovdqa %xmm0, (%rax) 89; AVX2-NEXT: vzeroupper 90; AVX2-NEXT: retq 91; 92; AVX2-FP-LABEL: store_i16_stride6_vf2: 93; AVX2-FP: # %bb.0: 94; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 95; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 96; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 97; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 98; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 99; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 100; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 101; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] 102; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 103; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,2,3,6,7,18,19,22,23,2,3,6,7,u,u,u,u,u,u,u,u] 104; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 105; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 106; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 107; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6,7] 108; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 109; AVX2-FP-NEXT: vmovq %xmm1, 16(%rax) 110; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax) 111; AVX2-FP-NEXT: vzeroupper 112; AVX2-FP-NEXT: retq 113; 114; AVX2-FCP-LABEL: store_i16_stride6_vf2: 115; AVX2-FCP: # %bb.0: 116; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 117; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 118; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 119; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 120; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 121; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 122; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 123; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] 124; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 125; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,2,3,6,7,18,19,22,23,2,3,6,7,u,u,u,u,u,u,u,u] 126; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 127; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 128; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 129; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6,7] 130; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 131; AVX2-FCP-NEXT: vmovq %xmm1, 16(%rax) 132; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) 133; AVX2-FCP-NEXT: vzeroupper 134; AVX2-FCP-NEXT: retq 135; 136; AVX512-LABEL: store_i16_stride6_vf2: 137; AVX512: # %bb.0: 138; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 139; AVX512-NEXT: vmovdqa (%rdi), %xmm0 140; AVX512-NEXT: vmovdqa (%rdx), %xmm1 141; AVX512-NEXT: vmovdqa (%r8), %xmm2 142; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 143; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 144; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 145; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] 146; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 147; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] 148; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 149; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] 150; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] 151; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] 152; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 153; AVX512-NEXT: vmovq %xmm1, 16(%rax) 154; AVX512-NEXT: vmovdqa %xmm0, (%rax) 155; AVX512-NEXT: vzeroupper 156; AVX512-NEXT: retq 157; 158; AVX512-FCP-LABEL: store_i16_stride6_vf2: 159; AVX512-FCP: # %bb.0: 160; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 161; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 162; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 163; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 164; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 165; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 166; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 167; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] 168; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 169; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,2,3,6,7,18,19,22,23,2,3,6,7,u,u,u,u,u,u,u,u] 170; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 171; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 172; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 173; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6,7] 174; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 175; AVX512-FCP-NEXT: vmovq %xmm1, 16(%rax) 176; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax) 177; AVX512-FCP-NEXT: vzeroupper 178; AVX512-FCP-NEXT: retq 179; 180; AVX512DQ-LABEL: store_i16_stride6_vf2: 181; AVX512DQ: # %bb.0: 182; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 183; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 184; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 185; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 186; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 187; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 188; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 189; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] 190; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 191; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] 192; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 193; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] 194; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] 195; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] 196; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 197; AVX512DQ-NEXT: vmovq %xmm1, 16(%rax) 198; AVX512DQ-NEXT: vmovdqa %xmm0, (%rax) 199; AVX512DQ-NEXT: vzeroupper 200; AVX512DQ-NEXT: retq 201; 202; AVX512DQ-FCP-LABEL: store_i16_stride6_vf2: 203; AVX512DQ-FCP: # %bb.0: 204; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 205; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 206; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 207; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 208; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 209; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 210; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 211; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] 212; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 213; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,2,3,6,7,18,19,22,23,2,3,6,7,u,u,u,u,u,u,u,u] 214; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 
215; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 216; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 217; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6,7] 218; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 219; AVX512DQ-FCP-NEXT: vmovq %xmm1, 16(%rax) 220; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax) 221; AVX512DQ-FCP-NEXT: vzeroupper 222; AVX512DQ-FCP-NEXT: retq 223; 224; AVX512BW-LABEL: store_i16_stride6_vf2: 225; AVX512BW: # %bb.0: 226; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 227; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 228; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 229; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 230; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 231; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 232; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 233; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] 234; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 235; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] 236; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 237; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 238; AVX512BW-NEXT: vmovq %xmm1, 16(%rax) 239; AVX512BW-NEXT: vmovdqa %xmm0, (%rax) 240; AVX512BW-NEXT: vzeroupper 241; AVX512BW-NEXT: retq 242; 243; AVX512BW-FCP-LABEL: store_i16_stride6_vf2: 244; AVX512BW-FCP: # %bb.0: 245; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 246; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 247; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 248; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 249; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 250; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 251; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 252; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] 253; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 254; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] 255; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 256; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 257; AVX512BW-FCP-NEXT: vmovq %xmm1, 16(%rax) 258; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rax) 259; AVX512BW-FCP-NEXT: vzeroupper 260; AVX512BW-FCP-NEXT: retq 261; 262; AVX512DQ-BW-LABEL: store_i16_stride6_vf2: 263; AVX512DQ-BW: # %bb.0: 264; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 265; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 266; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 267; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 268; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 269; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 270; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 271; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] 272; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 273; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] 274; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 275; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 276; AVX512DQ-BW-NEXT: vmovq %xmm1, 16(%rax) 277; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rax) 278; AVX512DQ-BW-NEXT: vzeroupper 279; AVX512DQ-BW-NEXT: retq 280; 281; AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf2: 282; AVX512DQ-BW-FCP: # %bb.0: 283; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 284; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 285; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 286; 
AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 287; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 288; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 289; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 290; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] 291; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 292; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] 293; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 294; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 295; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 16(%rax) 296; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rax) 297; AVX512DQ-BW-FCP-NEXT: vzeroupper 298; AVX512DQ-BW-FCP-NEXT: retq 299 %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64 300 %in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64 301 %in.vec2 = load <2 x i16>, ptr %in.vecptr2, align 64 302 %in.vec3 = load <2 x i16>, ptr %in.vecptr3, align 64 303 %in.vec4 = load <2 x i16>, ptr %in.vecptr4, align 64 304 %in.vec5 = load <2 x i16>, ptr %in.vecptr5, align 64 305 %1 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 306 %2 = shufflevector <2 x i16> %in.vec2, <2 x i16> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 307 %3 = shufflevector <2 x i16> %in.vec4, <2 x i16> %in.vec5, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 308 %4 = shufflevector <4 x i16> %1, <4 x i16> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 309 %5 = shufflevector <4 x i16> %3, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 310 %6 = shufflevector <8 x i16> %4, <8 x i16> %5, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> 311 %interleaved.vec = shufflevector <12 x i16> %6, <12 x i16> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11> 312 store <12 x i16> %interleaved.vec, ptr %out.vec, align 64 313 ret void 314} 315 316define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { 317; SSE-LABEL: store_i16_stride6_vf4: 318; SSE: # %bb.0: 319; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 320; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 321; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 322; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 323; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 324; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero 325; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 326; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero 327; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero 328; SSE-NEXT: movdqa %xmm3, %xmm4 329; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] 330; SSE-NEXT: movdqa %xmm1, %xmm5 331; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] 332; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,2,0] 333; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] 334; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm0[1,3] 335; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm6[0,2] 336; SSE-NEXT: movdqa %xmm0, %xmm6 337; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] 338; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 339; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] 340; SSE-NEXT: 
shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[0,2] 341; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3] 342; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,1,1,3] 343; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] 344; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] 345; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] 346; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] 347; SSE-NEXT: movaps %xmm3, 16(%rax) 348; SSE-NEXT: movaps %xmm5, (%rax) 349; SSE-NEXT: movaps %xmm0, 32(%rax) 350; SSE-NEXT: retq 351; 352; AVX-LABEL: store_i16_stride6_vf4: 353; AVX: # %bb.0: 354; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 355; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 356; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 357; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 358; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 359; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 360; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm2[0],xmm1[0] 361; AVX-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 362; AVX-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero 363; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm5[0],xmm4[0] 364; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13] 365; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,1,3] 366; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] 367; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] 368; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 369; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7] 370; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 371; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] 372; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11] 373; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] 374; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,0] 375; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] 376; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] 377; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 378; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] 379; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] 380; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] 381; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] 382; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 383; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] 384; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] 385; AVX-NEXT: vmovdqa %xmm0, 32(%rax) 386; AVX-NEXT: vmovaps %ymm1, (%rax) 387; AVX-NEXT: vzeroupper 388; AVX-NEXT: retq 389; 390; AVX2-LABEL: store_i16_stride6_vf4: 391; AVX2: # %bb.0: 392; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 393; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 394; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 395; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 396; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 397; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 398; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 399; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 400; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 401; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 402; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] 403; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13] 404; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm7 405; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 406; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2 407; AVX2-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4,5],ymm2[6],ymm7[7] 408; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 409; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3 410; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 411; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] 412; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 413; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] 414; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] 415; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 416; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] 417; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 418; AVX2-NEXT: vmovdqa %xmm0, 32(%rax) 419; AVX2-NEXT: vmovdqa %ymm2, (%rax) 420; AVX2-NEXT: vzeroupper 421; AVX2-NEXT: retq 422; 423; AVX2-FP-LABEL: store_i16_stride6_vf4: 424; AVX2-FP: # %bb.0: 425; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 426; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 427; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 428; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 429; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 430; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 431; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 432; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 433; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 434; AVX2-FP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 435; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] 436; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13] 437; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm7 438; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 439; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 440; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4,5],ymm2[6],ymm7[7] 441; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 442; AVX2-FP-NEXT: vpbroadcastq %xmm3, %ymm3 443; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 444; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] 445; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] 446; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 447; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] 448; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 449; AVX2-FP-NEXT: vmovdqa %xmm0, 32(%rax) 450; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax) 451; AVX2-FP-NEXT: vzeroupper 452; AVX2-FP-NEXT: retq 453; 454; AVX2-FCP-LABEL: store_i16_stride6_vf4: 455; AVX2-FCP: # %bb.0: 456; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 457; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 458; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 459; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 460; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 461; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 462; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 463; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 464; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 465; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 466; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] 467; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13] 468; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm7 469; 
AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,6,1,3,4,6,1,3] 470; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] 471; AVX2-FCP-NEXT: vpermd %ymm2, %ymm8, %ymm2 472; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 473; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4,5],ymm2[6],ymm7[7] 474; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 475; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3 476; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 477; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] 478; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] 479; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 480; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] 481; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 482; AVX2-FCP-NEXT: vmovdqa %xmm0, 32(%rax) 483; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax) 484; AVX2-FCP-NEXT: vzeroupper 485; AVX2-FCP-NEXT: retq 486; 487; AVX512-LABEL: store_i16_stride6_vf4: 488; AVX512: # %bb.0: 489; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 490; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 491; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 492; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 493; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 494; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 495; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 496; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 497; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 498; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 499; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 500; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13] 501; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm5 502; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] 503; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 504; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] 505; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] 506; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] 507; AVX512-NEXT: vpbroadcastq %xmm4, %ymm4 508; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 509; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] 510; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 511; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] 512; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] 513; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 514; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] 515; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 516; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 517; AVX512-NEXT: vmovdqa %xmm0, 32(%rax) 518; AVX512-NEXT: vmovdqa %ymm1, (%rax) 519; AVX512-NEXT: vzeroupper 520; AVX512-NEXT: retq 521; 522; AVX512-FCP-LABEL: store_i16_stride6_vf4: 523; AVX512-FCP: # %bb.0: 524; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 525; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 526; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 527; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 528; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 529; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 530; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 531; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 532; 
AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 533; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] 534; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 535; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13] 536; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm7 537; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,6,1,3,4,6,1,3] 538; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] 539; AVX512-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 540; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 541; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4,5],ymm5[6],ymm7[7] 542; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 543; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] 544; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 545; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] 546; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] 547; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 548; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] 549; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 550; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 551; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rax) 552; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax) 553; AVX512-FCP-NEXT: vzeroupper 554; AVX512-FCP-NEXT: retq 555; 556; AVX512DQ-LABEL: store_i16_stride6_vf4: 557; AVX512DQ: # %bb.0: 558; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 559; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 560; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 561; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 562; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 563; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 564; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 565; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 566; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 567; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 568; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 569; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13] 570; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm5 571; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] 572; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 573; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] 574; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] 575; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] 576; AVX512DQ-NEXT: vpbroadcastq %xmm4, %ymm4 577; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 578; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] 579; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 580; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] 581; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] 582; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 583; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] 584; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 585; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 586; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rax) 587; AVX512DQ-NEXT: vmovdqa %ymm1, (%rax) 588; AVX512DQ-NEXT: vzeroupper 589; AVX512DQ-NEXT: retq 590; 591; AVX512DQ-FCP-LABEL: store_i16_stride6_vf4: 592; 
AVX512DQ-FCP: # %bb.0: 593; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 594; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 595; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 596; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 597; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 598; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 599; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 600; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 601; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 602; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] 603; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 604; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13] 605; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm7 606; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,6,1,3,4,6,1,3] 607; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] 608; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 609; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 610; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4,5],ymm5[6],ymm7[7] 611; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 612; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] 613; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 614; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] 615; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] 616; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 617; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] 618; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 619; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 620; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rax) 621; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax) 622; AVX512DQ-FCP-NEXT: vzeroupper 623; AVX512DQ-FCP-NEXT: retq 624; 625; AVX512BW-LABEL: store_i16_stride6_vf4: 626; AVX512BW: # %bb.0: 627; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 628; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 629; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 630; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 631; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 632; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 633; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 634; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 635; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 636; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 637; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 638; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 639; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0] 640; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 641; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) 642; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) 643; AVX512BW-NEXT: vzeroupper 644; AVX512BW-NEXT: retq 645; 646; AVX512BW-FCP-LABEL: store_i16_stride6_vf4: 647; AVX512BW-FCP: # %bb.0: 648; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 649; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 650; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 651; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 652; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 653; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = 
mem[0],zero 654; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 655; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 656; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 657; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 658; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 659; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 660; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0] 661; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 662; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) 663; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) 664; AVX512BW-FCP-NEXT: vzeroupper 665; AVX512BW-FCP-NEXT: retq 666; 667; AVX512DQ-BW-LABEL: store_i16_stride6_vf4: 668; AVX512DQ-BW: # %bb.0: 669; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 670; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 671; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 672; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 673; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 674; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 675; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 676; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 677; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 678; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 679; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 680; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 681; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0] 682; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 683; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) 684; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) 685; AVX512DQ-BW-NEXT: vzeroupper 686; AVX512DQ-BW-NEXT: retq 687; 688; AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf4: 689; AVX512DQ-BW-FCP: # %bb.0: 690; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 691; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 692; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 693; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 694; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 695; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 696; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 697; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 698; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 699; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 700; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 701; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 702; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0] 703; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 704; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) 705; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) 706; AVX512DQ-BW-FCP-NEXT: vzeroupper 707; AVX512DQ-BW-FCP-NEXT: retq 708 %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64 709 %in.vec1 = load <4 x i16>, ptr %in.vecptr1, align 64 710 %in.vec2 = load <4 x i16>, ptr %in.vecptr2, align 64 711 %in.vec3 = load <4 x i16>, ptr %in.vecptr3, align 64 712 %in.vec4 = load <4 x i16>, ptr %in.vecptr4, align 64 713 %in.vec5 = load <4 x i16>, ptr %in.vecptr5, align 64 714 %1 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 
7> 715 %2 = shufflevector <4 x i16> %in.vec2, <4 x i16> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 716 %3 = shufflevector <4 x i16> %in.vec4, <4 x i16> %in.vec5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 717 %4 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 718 %5 = shufflevector <8 x i16> %3, <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 719 %6 = shufflevector <16 x i16> %4, <16 x i16> %5, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> 720 %interleaved.vec = shufflevector <24 x i16> %6, <24 x i16> poison, <24 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23> 721 store <24 x i16> %interleaved.vec, ptr %out.vec, align 64 722 ret void 723} 724 725define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { 726; SSE-LABEL: store_i16_stride6_vf8: 727; SSE: # %bb.0: 728; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 729; SSE-NEXT: movdqa (%rdi), %xmm0 730; SSE-NEXT: movdqa (%rsi), %xmm8 731; SSE-NEXT: movdqa (%rdx), %xmm1 732; SSE-NEXT: movdqa (%rcx), %xmm9 733; SSE-NEXT: movdqa (%r8), %xmm6 734; SSE-NEXT: movdqa (%r9), %xmm5 735; SSE-NEXT: movdqa %xmm1, %xmm4 736; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 737; SSE-NEXT: movdqa %xmm0, %xmm7 738; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] 739; SSE-NEXT: movdqa %xmm7, %xmm10 740; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm4[3,3] 741; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,7,7] 742; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm2[2,3] 743; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] 744; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,0] 745; SSE-NEXT: andps %xmm2, %xmm10 746; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] 747; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,1,2,3] 748; SSE-NEXT: movaps %xmm2, %xmm3 749; SSE-NEXT: andnps %xmm11, %xmm3 750; SSE-NEXT: orps %xmm10, %xmm3 751; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] 752; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] 753; SSE-NEXT: movdqa %xmm0, %xmm8 754; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm1[3,3] 755; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[2,1,3,3,4,5,6,7] 756; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm9[0,1] 757; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] 758; SSE-NEXT: andps %xmm2, %xmm8 759; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7] 760; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] 761; SSE-NEXT: andnps %xmm9, %xmm2 762; SSE-NEXT: orps %xmm8, %xmm2 763; SSE-NEXT: movdqa %xmm1, %xmm10 764; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm0[0] 765; SSE-NEXT: movdqa %xmm6, %xmm8 766; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3] 767; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm8[0,2] 
768; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535] 769; SSE-NEXT: andps %xmm8, %xmm10 770; SSE-NEXT: movdqa %xmm5, %xmm11 771; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] 772; SSE-NEXT: movaps %xmm8, %xmm9 773; SSE-NEXT: andnps %xmm11, %xmm9 774; SSE-NEXT: orps %xmm10, %xmm9 775; SSE-NEXT: movdqa %xmm7, %xmm10 776; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm4[1] 777; SSE-NEXT: movdqa %xmm6, %xmm12 778; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 779; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm4[1,1] 780; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm10[0,2] 781; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] 782; SSE-NEXT: andps %xmm10, %xmm12 783; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] 784; SSE-NEXT: movaps %xmm10, %xmm11 785; SSE-NEXT: andnps %xmm13, %xmm11 786; SSE-NEXT: orps %xmm12, %xmm11 787; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] 788; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm6[1,1,1,1,4,5,6,7] 789; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm7[1,3] 790; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[0,2] 791; SSE-NEXT: andps %xmm8, %xmm4 792; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] 793; SSE-NEXT: pslld $16, %xmm5 794; SSE-NEXT: andnps %xmm5, %xmm8 795; SSE-NEXT: orps %xmm4, %xmm8 796; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 797; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] 798; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2] 799; SSE-NEXT: andps %xmm10, %xmm12 800; SSE-NEXT: andnps %xmm6, %xmm10 801; SSE-NEXT: orps %xmm12, %xmm10 802; SSE-NEXT: movaps %xmm10, 16(%rax) 803; SSE-NEXT: movaps %xmm8, 48(%rax) 804; SSE-NEXT: movaps %xmm11, 64(%rax) 805; SSE-NEXT: movaps %xmm9, (%rax) 806; SSE-NEXT: movaps %xmm2, 32(%rax) 807; SSE-NEXT: movaps %xmm3, 80(%rax) 808; SSE-NEXT: retq 809; 810; AVX-LABEL: store_i16_stride6_vf8: 811; AVX: # %bb.0: 812; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 813; AVX-NEXT: vmovdqa (%rdi), %xmm0 814; AVX-NEXT: vmovdqa (%rsi), %xmm1 815; AVX-NEXT: vmovdqa (%rdx), %xmm2 816; AVX-NEXT: vmovdqa (%rcx), %xmm3 817; AVX-NEXT: vmovdqa (%r8), %xmm4 818; AVX-NEXT: vmovdqa (%r9), %xmm5 819; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 820; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 821; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2] 822; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm6[4,5],xmm8[6,7] 823; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 824; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] 825; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,0,1,1] 826; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,1,0,1] 827; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] 828; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,1,0,1] 829; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7] 830; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 831; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 832; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] 833; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 834; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] 835; AVX-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] 836; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 837; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] 838; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5],xmm1[6,7] 839; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,2,3,3] 840; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1] 841; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] 842; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] 843; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 844; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] 845; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm0[1],xmm4[1] 846; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] 847; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] 848; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] 849; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] 850; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] 851; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 852; AVX-NEXT: vmovaps %ymm0, 64(%rax) 853; AVX-NEXT: vmovaps %ymm1, 32(%rax) 854; AVX-NEXT: vmovaps %ymm8, (%rax) 855; AVX-NEXT: vzeroupper 856; AVX-NEXT: retq 857; 858; AVX2-LABEL: store_i16_stride6_vf8: 859; AVX2: # %bb.0: 860; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 861; AVX2-NEXT: vmovdqa (%rdi), %xmm0 862; AVX2-NEXT: vmovdqa (%rdx), %xmm1 863; AVX2-NEXT: vmovdqa (%r8), %xmm2 864; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 865; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 866; AVX2-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 867; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] 868; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] 869; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 870; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] 871; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 872; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] 873; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] 874; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] 875; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 876; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,1,3] 877; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27] 878; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm4 879; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,1,3] 880; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 881; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] 882; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] 883; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] 884; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] 885; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 886; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,10,11,8,9,10,11,4,5,12,13,4,5,12,13,18,19,26,27,22,23,30,31,22,23,30,31,20,21,28,29] 887; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 888; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 889; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1 890; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 891; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] 892; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] 893; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 894; AVX2-NEXT: vmovdqa %ymm0, 64(%rax) 895; AVX2-NEXT: vmovdqa %ymm4, 32(%rax) 896; AVX2-NEXT: vmovdqa %ymm3, (%rax) 897; AVX2-NEXT: vzeroupper 898; AVX2-NEXT: retq 899; 900; AVX2-FP-LABEL: store_i16_stride6_vf8: 901; AVX2-FP: # %bb.0: 902; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 903; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 904; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 905; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 906; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 907; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 908; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 909; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] 910; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] 911; AVX2-FP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 912; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] 913; AVX2-FP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 914; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] 915; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] 916; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] 917; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 918; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,1,3] 919; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27] 920; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 921; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,1,3] 922; AVX2-FP-NEXT: vpshufb %ymm5, %ymm6, %ymm5 923; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] 924; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] 925; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] 926; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] 927; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 928; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,10,11,8,9,10,11,4,5,12,13,4,5,12,13,18,19,26,27,22,23,30,31,22,23,30,31,20,21,28,29] 929; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 930; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 931; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 932; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 933; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] 934; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] 935; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 936; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%rax) 937; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%rax) 938; AVX2-FP-NEXT: vmovdqa %ymm3, (%rax) 939; AVX2-FP-NEXT: vzeroupper 940; AVX2-FP-NEXT: retq 941; 942; AVX2-FCP-LABEL: store_i16_stride6_vf8: 943; AVX2-FCP: # %bb.0: 944; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 945; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 946; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 947; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 948; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 949; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 950; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 951; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,1,5,0,4,1,5] 952; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] 953; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 954; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = 
[0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29] 955; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 956; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] 957; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 958; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] 959; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,4,0,0,4,4,0] 960; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] 961; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm4 962; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] 963; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 964; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,6,1,5,2,6,1,5] 965; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] 966; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 967; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27] 968; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 969; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,1,3] 970; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5 971; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] 972; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] 973; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] 974; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] 975; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,3,3,7,7,3,3,7] 976; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] 977; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 978; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,10,11,8,9,10,11,8,9,12,13,4,5,12,13,18,19,26,27,22,23,18,19,22,23,30,31,20,21,28,29] 979; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 980; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 981; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 982; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 983; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] 984; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] 985; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 986; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%rax) 987; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rax) 988; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rax) 989; AVX2-FCP-NEXT: vzeroupper 990; AVX2-FCP-NEXT: retq 991; 992; AVX512-LABEL: store_i16_stride6_vf8: 993; AVX512: # %bb.0: 994; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 995; AVX512-NEXT: vmovdqa (%rdi), %xmm0 996; AVX512-NEXT: vmovdqa (%rdx), %xmm1 997; AVX512-NEXT: vmovdqa (%r8), %xmm2 998; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 999; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1000; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 1001; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3] 1002; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27] 1003; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1004; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] 1005; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1006; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 1007; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] 1008; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] 1009; AVX512-NEXT: vpblendd 
{{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] 1010; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,0,2] 1011; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] 1012; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm4 1013; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] 1014; AVX512-NEXT: vpshufb %ymm5, %ymm6, %ymm5 1015; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] 1016; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2] 1017; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] 1018; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 1019; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 1020; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 1021; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,4,5,12,13,4,5,12,13,18,19,26,27,22,23,30,31,22,23,30,31,20,21,28,29] 1022; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1023; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 1024; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1025; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 1026; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] 1027; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] 1028; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 1029; AVX512-NEXT: vmovdqa %ymm0, 64(%rax) 1030; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) 1031; AVX512-NEXT: vzeroupper 1032; AVX512-NEXT: retq 1033; 1034; AVX512-FCP-LABEL: store_i16_stride6_vf8: 1035; AVX512-FCP: # %bb.0: 1036; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1037; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 1038; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 1039; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 1040; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1041; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1042; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 1043; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] 1044; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] 1045; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 1046; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27] 1047; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1048; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] 1049; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1050; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 1051; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] 1052; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] 1053; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] 1054; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,1,5,0,4,1,5] 1055; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] 1056; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 1057; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29] 1058; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 1059; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] 1060; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5 1061; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] 1062; AVX512-FCP-NEXT: vbroadcasti128 
{{.*#+}} ymm5 = [0,4,4,0,0,4,4,0] 1063; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] 1064; AVX512-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 1065; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] 1066; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 1067; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 1068; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,3,3,7,7,3,3,7] 1069; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] 1070; AVX512-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 1071; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,8,9,12,13,4,5,12,13,18,19,26,27,22,23,18,19,22,23,30,31,20,21,28,29] 1072; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1073; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 1074; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1075; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 1076; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] 1077; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] 1078; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 1079; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax) 1080; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) 1081; AVX512-FCP-NEXT: vzeroupper 1082; AVX512-FCP-NEXT: retq 1083; 1084; AVX512DQ-LABEL: store_i16_stride6_vf8: 1085; AVX512DQ: # %bb.0: 1086; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 1087; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 1088; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 1089; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 1090; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1091; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1092; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 1093; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3] 1094; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27] 1095; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1096; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] 1097; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1098; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 1099; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] 1100; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] 1101; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] 1102; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,0,2] 1103; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] 1104; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm4 1105; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] 1106; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 1107; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] 1108; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2] 1109; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] 1110; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 1111; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 1112; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 1113; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,4,5,12,13,4,5,12,13,18,19,26,27,22,23,30,31,22,23,30,31,20,21,28,29] 1114; AVX512DQ-NEXT: vpshufb %ymm4, 
%ymm0, %ymm0 1115; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 1116; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1117; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 1118; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] 1119; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] 1120; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 1121; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax) 1122; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) 1123; AVX512DQ-NEXT: vzeroupper 1124; AVX512DQ-NEXT: retq 1125; 1126; AVX512DQ-FCP-LABEL: store_i16_stride6_vf8: 1127; AVX512DQ-FCP: # %bb.0: 1128; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1129; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 1130; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 1131; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 1132; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1133; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1134; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 1135; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] 1136; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] 1137; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 1138; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27] 1139; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1140; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] 1141; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1142; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 1143; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] 1144; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] 1145; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] 1146; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,1,5,0,4,1,5] 1147; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] 1148; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 1149; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29] 1150; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 1151; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] 1152; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5 1153; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] 1154; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,4,0,0,4,4,0] 1155; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] 1156; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 1157; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] 1158; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 1159; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 1160; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,3,3,7,7,3,3,7] 1161; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] 1162; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 1163; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,8,9,12,13,4,5,12,13,18,19,26,27,22,23,18,19,22,23,30,31,20,21,28,29] 1164; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1165; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 1166; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1167; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 1168; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] 1169; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] 1170; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 1171; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax) 1172; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) 1173; AVX512DQ-FCP-NEXT: vzeroupper 1174; AVX512DQ-FCP-NEXT: retq 1175; 1176; AVX512BW-LABEL: store_i16_stride6_vf8: 1177; AVX512BW: # %bb.0: 1178; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1179; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 1180; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 1181; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 1182; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1183; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1184; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1185; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 1186; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] 1187; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 1188; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] 1189; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 1190; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) 1191; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rax) 1192; AVX512BW-NEXT: vzeroupper 1193; AVX512BW-NEXT: retq 1194; 1195; AVX512BW-FCP-LABEL: store_i16_stride6_vf8: 1196; AVX512BW-FCP: # %bb.0: 1197; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1198; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 1199; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 1200; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 1201; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1202; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1203; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1204; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 1205; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] 1206; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 1207; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] 1208; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 1209; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) 1210; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) 1211; AVX512BW-FCP-NEXT: vzeroupper 1212; AVX512BW-FCP-NEXT: retq 1213; 1214; AVX512DQ-BW-LABEL: store_i16_stride6_vf8: 1215; AVX512DQ-BW: # %bb.0: 1216; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1217; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 1218; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 1219; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 1220; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1221; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1222; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1223; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 1224; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] 1225; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 1226; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] 1227; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 1228; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) 1229; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rax) 1230; AVX512DQ-BW-NEXT: vzeroupper 1231; AVX512DQ-BW-NEXT: retq 1232; 1233; 
AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf8: 1234; AVX512DQ-BW-FCP: # %bb.0: 1235; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1236; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 1237; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 1238; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 1239; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1240; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1241; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1242; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 1243; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] 1244; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 1245; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] 1246; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 1247; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) 1248; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) 1249; AVX512DQ-BW-FCP-NEXT: vzeroupper 1250; AVX512DQ-BW-FCP-NEXT: retq 1251 %in.vec0 = load <8 x i16>, ptr %in.vecptr0, align 64 1252 %in.vec1 = load <8 x i16>, ptr %in.vecptr1, align 64 1253 %in.vec2 = load <8 x i16>, ptr %in.vecptr2, align 64 1254 %in.vec3 = load <8 x i16>, ptr %in.vecptr3, align 64 1255 %in.vec4 = load <8 x i16>, ptr %in.vecptr4, align 64 1256 %in.vec5 = load <8 x i16>, ptr %in.vecptr5, align 64 1257 %1 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1258 %2 = shufflevector <8 x i16> %in.vec2, <8 x i16> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1259 %3 = shufflevector <8 x i16> %in.vec4, <8 x i16> %in.vec5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1260 %4 = shufflevector <16 x i16> %1, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1261 %5 = shufflevector <16 x i16> %3, <16 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1262 %6 = shufflevector <32 x i16> %4, <32 x i16> %5, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 1263 %interleaved.vec = shufflevector <48 x i16> %6, <48 x i16> poison, <48 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 7, i32 15, i32 
23, i32 31, i32 39, i32 47> 1264 store <48 x i16> %interleaved.vec, ptr %out.vec, align 64 1265 ret void 1266} 1267 1268define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { 1269; SSE-LABEL: store_i16_stride6_vf16: 1270; SSE: # %bb.0: 1271; SSE-NEXT: subq $24, %rsp 1272; SSE-NEXT: movdqa (%rdi), %xmm15 1273; SSE-NEXT: movdqa 16(%rdi), %xmm11 1274; SSE-NEXT: movdqa (%rsi), %xmm10 1275; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1276; SSE-NEXT: movdqa 16(%rsi), %xmm4 1277; SSE-NEXT: movdqa (%rdx), %xmm14 1278; SSE-NEXT: movdqa 16(%rdx), %xmm12 1279; SSE-NEXT: movdqa (%rcx), %xmm3 1280; SSE-NEXT: movdqa 16(%rcx), %xmm2 1281; SSE-NEXT: movdqa 16(%r8), %xmm0 1282; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1283; SSE-NEXT: movdqa 16(%r9), %xmm8 1284; SSE-NEXT: movdqa %xmm12, %xmm6 1285; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] 1286; SSE-NEXT: movdqa %xmm11, %xmm5 1287; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 1288; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1289; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm6[3,3] 1290; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] 1291; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm0[0,1] 1292; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] 1293; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,0] 1294; SSE-NEXT: andps %xmm7, %xmm5 1295; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,2,2,3,4,5,6,7] 1296; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] 1297; SSE-NEXT: movaps %xmm7, %xmm0 1298; SSE-NEXT: andnps %xmm9, %xmm0 1299; SSE-NEXT: orps %xmm5, %xmm0 1300; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill 1301; SSE-NEXT: movdqa %xmm14, %xmm5 1302; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 1303; SSE-NEXT: movdqa %xmm15, %xmm13 1304; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] 1305; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1306; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[3,3] 1307; SSE-NEXT: movdqa (%r8), %xmm10 1308; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,6,5,7,7] 1309; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,2],xmm9[2,3] 1310; SSE-NEXT: movdqa (%r9), %xmm9 1311; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,6,6,7] 1312; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 1313; SSE-NEXT: movaps %xmm7, %xmm0 1314; SSE-NEXT: andnps %xmm1, %xmm0 1315; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0,1,3] 1316; SSE-NEXT: andps %xmm7, %xmm13 1317; SSE-NEXT: orps %xmm13, %xmm0 1318; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1319; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] 1320; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] 1321; SSE-NEXT: movdqa %xmm11, %xmm1 1322; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm12[3,3] 1323; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 1324; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,6,5,7,7] 1325; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] 1326; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1327; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,6,6,7] 1328; 
SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] 1329; SSE-NEXT: movaps %xmm7, %xmm0 1330; SSE-NEXT: andnps %xmm4, %xmm0 1331; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 1332; SSE-NEXT: andps %xmm7, %xmm1 1333; SSE-NEXT: orps %xmm1, %xmm0 1334; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1335; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] 1336; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 1337; SSE-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3] 1338; SSE-NEXT: movdqa %xmm15, %xmm1 1339; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm14[3,3] 1340; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[2,1,3,3,4,5,6,7] 1341; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,1] 1342; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 1343; SSE-NEXT: andps %xmm7, %xmm1 1344; SSE-NEXT: movdqa %xmm9, %xmm2 1345; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1346; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[0,2,2,3,4,5,6,7] 1347; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 1348; SSE-NEXT: andnps %xmm3, %xmm7 1349; SSE-NEXT: orps %xmm1, %xmm7 1350; SSE-NEXT: movdqa %xmm12, %xmm3 1351; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm11[0] 1352; SSE-NEXT: movdqa %xmm13, %xmm1 1353; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm11[1,3] 1354; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,2] 1355; SSE-NEXT: pslld $16, %xmm8 1356; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] 1357; SSE-NEXT: movdqa %xmm1, %xmm13 1358; SSE-NEXT: pandn %xmm8, %xmm13 1359; SSE-NEXT: andps %xmm1, %xmm3 1360; SSE-NEXT: por %xmm3, %xmm13 1361; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 1362; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] 1363; SSE-NEXT: movdqa %xmm10, %xmm4 1364; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 1365; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1] 1366; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[0,2] 1367; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] 1368; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] 1369; SSE-NEXT: movdqa %xmm3, %xmm9 1370; SSE-NEXT: pandn %xmm0, %xmm9 1371; SSE-NEXT: andps %xmm3, %xmm4 1372; SSE-NEXT: por %xmm4, %xmm9 1373; SSE-NEXT: movdqa %xmm15, %xmm0 1374; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm14[1] 1375; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[1,1,1,1,4,5,6,7] 1376; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm14[1,1] 1377; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] 1378; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] 1379; SSE-NEXT: movdqa %xmm3, %xmm8 1380; SSE-NEXT: pandn %xmm0, %xmm8 1381; SSE-NEXT: andps %xmm3, %xmm4 1382; SSE-NEXT: por %xmm4, %xmm8 1383; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm15[0] 1384; SSE-NEXT: movdqa %xmm10, %xmm0 1385; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[1,3] 1386; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2] 1387; SSE-NEXT: movdqa %xmm2, %xmm0 1388; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 1389; SSE-NEXT: movdqa %xmm1, %xmm15 1390; SSE-NEXT: pandn %xmm0, %xmm15 1391; SSE-NEXT: andps %xmm1, %xmm14 1392; SSE-NEXT: por %xmm14, %xmm15 1393; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] 1394; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 1395; SSE-NEXT: movdqa %xmm14, %xmm0 1396; SSE-NEXT: psrldq {{.*#+}} xmm0 
= xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 1397; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[1,1] 1398; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[0,2] 1399; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 1400; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] 1401; SSE-NEXT: movdqa %xmm3, %xmm11 1402; SSE-NEXT: pandn %xmm4, %xmm11 1403; SSE-NEXT: andps %xmm3, %xmm0 1404; SSE-NEXT: por %xmm0, %xmm11 1405; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1406; SSE-NEXT: movaps %xmm2, %xmm0 1407; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] 1408; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[1,1,1,1,4,5,6,7] 1409; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm6[1,1] 1410; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] 1411; SSE-NEXT: andps %xmm3, %xmm4 1412; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,1,1] 1413; SSE-NEXT: pandn %xmm0, %xmm3 1414; SSE-NEXT: por %xmm4, %xmm3 1415; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] 1416; SSE-NEXT: movdqa %xmm14, %xmm0 1417; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1,3] 1418; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,2] 1419; SSE-NEXT: movdqa %xmm12, %xmm4 1420; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] 1421; SSE-NEXT: movdqa %xmm1, %xmm0 1422; SSE-NEXT: pandn %xmm4, %xmm0 1423; SSE-NEXT: andps %xmm1, %xmm6 1424; SSE-NEXT: por %xmm6, %xmm0 1425; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1426; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] 1427; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,1],xmm2[1,3] 1428; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm10[0,2] 1429; SSE-NEXT: andps %xmm1, %xmm5 1430; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1431; SSE-NEXT: pslld $16, %xmm2 1432; SSE-NEXT: pandn %xmm2, %xmm1 1433; SSE-NEXT: por %xmm5, %xmm1 1434; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1435; SSE-NEXT: movdqa %xmm1, 48(%rax) 1436; SSE-NEXT: movdqa %xmm0, 96(%rax) 1437; SSE-NEXT: movdqa %xmm3, 112(%rax) 1438; SSE-NEXT: movdqa %xmm11, 160(%rax) 1439; SSE-NEXT: movdqa %xmm15, (%rax) 1440; SSE-NEXT: movdqa %xmm8, 16(%rax) 1441; SSE-NEXT: movdqa %xmm9, 64(%rax) 1442; SSE-NEXT: movdqa %xmm13, 144(%rax) 1443; SSE-NEXT: movaps %xmm7, 32(%rax) 1444; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1445; SSE-NEXT: movaps %xmm0, 176(%rax) 1446; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1447; SSE-NEXT: movaps %xmm0, 80(%rax) 1448; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 1449; SSE-NEXT: movaps %xmm0, 128(%rax) 1450; SSE-NEXT: addq $24, %rsp 1451; SSE-NEXT: retq 1452; 1453; AVX-LABEL: store_i16_stride6_vf16: 1454; AVX: # %bb.0: 1455; AVX-NEXT: vmovdqa (%rcx), %xmm3 1456; AVX-NEXT: vmovdqa 16(%rcx), %xmm0 1457; AVX-NEXT: vmovdqa (%rdx), %xmm4 1458; AVX-NEXT: vmovdqa 16(%rdx), %xmm1 1459; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1460; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] 1461; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1462; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 1463; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1464; AVX-NEXT: vmovdqa (%rsi), %xmm5 1465; AVX-NEXT: vmovdqa 16(%rsi), %xmm2 1466; AVX-NEXT: vmovdqa (%rdi), %xmm6 1467; AVX-NEXT: vmovdqa 16(%rdi), %xmm7 1468; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] 1469; AVX-NEXT: 
vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] 1470; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] 1471; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] 1472; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 1473; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7] 1474; AVX-NEXT: vextractf128 $1, %ymm1, %xmm7 1475; AVX-NEXT: vmovdqa 16(%r8), %xmm10 1476; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4,5],xmm7[6,7] 1477; AVX-NEXT: vmovdqa 16(%r9), %xmm11 1478; AVX-NEXT: vpslld $16, %xmm11, %xmm12 1479; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7] 1480; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1481; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[2,1,3,3,4,5,6,7] 1482; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] 1483; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3] 1484; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7] 1485; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] 1486; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5,6],xmm7[7] 1487; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1488; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] 1489; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1490; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] 1491; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 1492; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 1493; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 1494; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 1495; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] 1496; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] 1497; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] 1498; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1499; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1500; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,6,5,7,7] 1501; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 1502; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] 1503; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,6,6,7] 1504; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 1505; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] 1506; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 1507; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 1508; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] 1509; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[2,2,3,3] 1510; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 1511; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] 1512; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 1513; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] 1514; AVX-NEXT: vmovdqa (%r8), %xmm14 1515; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,7,7] 1516; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 1517; AVX-NEXT: vextractf128 $1, %ymm15, %xmm12 1518; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3,4,5],xmm0[6,7] 1519; AVX-NEXT: vmovdqa (%r9), %xmm0 1520; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] 1521; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] 1522; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1],xmm12[2,3,4,5,6],xmm2[7] 1523; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 
1524; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0],xmm2[1],xmm15[2,3] 1525; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] 1526; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2],xmm15[3],xmm2[4,5,6,7] 1527; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] 1528; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] 1529; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 1530; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,1,0,1] 1531; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 1532; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6],ymm2[7] 1533; AVX-NEXT: vinsertps {{.*#+}} xmm8 = xmm2[0,1],xmm10[0],xmm2[3] 1534; AVX-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] 1535; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] 1536; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 1537; AVX-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero 1538; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5,6,7] 1539; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1] 1540; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm9[3],xmm2[4,5,6,7] 1541; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 1542; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] 1543; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] 1544; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 1545; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] 1546; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] 1547; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm5 1548; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7] 1549; AVX-NEXT: vextractf128 $1, %ymm3, %xmm5 1550; AVX-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero 1551; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] 1552; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] 1553; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] 1554; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm14[0],xmm3[3] 1555; AVX-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 1556; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5],xmm3[6,7] 1557; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] 1558; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 1559; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1560; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] 1561; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,1,0,1] 1562; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 1563; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 1564; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,3,3,4,5,6,7] 1565; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 1566; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2],xmm2[3] 1567; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 1568; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] 1569; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5,6],xmm4[7] 1570; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1571; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4,5],xmm1[6,7] 1572; AVX-NEXT: vpslld $16, %xmm0, %xmm0 1573; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] 1574; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 1575; AVX-NEXT: vmovdqa %xmm0, 48(%rax) 1576; AVX-NEXT: vmovdqa %xmm2, 32(%rax) 1577; AVX-NEXT: vmovdqa %xmm3, (%rax) 1578; AVX-NEXT: vmovdqa 
%xmm5, 16(%rax) 1579; AVX-NEXT: vmovdqa %xmm9, 112(%rax) 1580; AVX-NEXT: vmovdqa %xmm8, 96(%rax) 1581; AVX-NEXT: vmovdqa %xmm15, 64(%rax) 1582; AVX-NEXT: vmovdqa %xmm12, 80(%rax) 1583; AVX-NEXT: vmovdqa %xmm7, 176(%rax) 1584; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1585; AVX-NEXT: vmovaps %xmm0, 160(%rax) 1586; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1587; AVX-NEXT: vmovaps %xmm0, 128(%rax) 1588; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1589; AVX-NEXT: vmovaps %xmm0, 144(%rax) 1590; AVX-NEXT: vzeroupper 1591; AVX-NEXT: retq 1592; 1593; AVX2-LABEL: store_i16_stride6_vf16: 1594; AVX2: # %bb.0: 1595; AVX2-NEXT: vmovdqa (%rdi), %ymm1 1596; AVX2-NEXT: vmovdqa (%rsi), %ymm3 1597; AVX2-NEXT: vmovdqa (%rdx), %ymm2 1598; AVX2-NEXT: vmovdqa (%rcx), %ymm4 1599; AVX2-NEXT: vmovdqa (%r8), %ymm13 1600; AVX2-NEXT: vmovdqa (%rcx), %xmm6 1601; AVX2-NEXT: vpsrldq {{.*#+}} xmm5 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 1602; AVX2-NEXT: vmovdqa (%rdx), %xmm7 1603; AVX2-NEXT: vpsrldq {{.*#+}} xmm8 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 1604; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] 1605; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5 1606; AVX2-NEXT: vmovdqa (%rsi), %xmm8 1607; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,1,2,1] 1608; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,7,6,5] 1609; AVX2-NEXT: vmovdqa (%rdi), %xmm9 1610; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,1,2,1] 1611; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] 1612; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] 1613; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] 1614; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7] 1615; AVX2-NEXT: vmovdqa (%r8), %xmm10 1616; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7] 1617; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] 1618; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7] 1619; AVX2-NEXT: vmovdqa (%r9), %xmm11 1620; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,2,2,3,4,5,6,7] 1621; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] 1622; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] 1623; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 1624; AVX2-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5 1625; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1626; AVX2-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 1627; AVX2-NEXT: vpsrldq {{.*#+}} ymm14 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 1628; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] 1629; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,1,2,3,6,5,6,7] 1630; AVX2-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 1631; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,1,2,3,6,5,6,7] 1632; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 1633; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm15 = 
ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] 1634; AVX2-NEXT: vmovdqa (%r9), %ymm12 1635; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] 1636; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] 1637; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] 1638; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 1639; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] 1640; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] 1641; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 1642; AVX2-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 1643; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] 1644; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 1645; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1646; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 1647; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,1,1] 1648; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 1649; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,2,3,3] 1650; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] 1651; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] 1652; AVX2-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 1653; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] 1654; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] 1655; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,3,2,3] 1656; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7] 1657; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] 1658; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 1659; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 1660; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] 1661; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] 1662; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] 1663; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] 1664; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] 1665; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] 1666; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 1667; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] 1668; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] 1669; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[2,3,2,3,6,7,6,7] 1670; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 1671; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] 1672; AVX2-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm0 1673; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 1674; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 1675; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] 1676; 
AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] 1677; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 1678; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] 1679; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero 1680; AVX2-NEXT: vpbroadcastq %xmm6, %ymm6 1681; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] 1682; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] 1683; AVX2-NEXT: vpbroadcastq %xmm6, %ymm6 1684; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 1685; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 1686; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] 1687; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] 1688; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 1689; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] 1690; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 1691; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 1692; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 1693; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 1694; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 1695; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 1696; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 1697; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 1698; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 1699; AVX2-NEXT: vmovdqa %ymm1, 96(%rax) 1700; AVX2-NEXT: vmovdqa %ymm0, 160(%rax) 1701; AVX2-NEXT: vmovdqa %ymm14, 64(%rax) 1702; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1703; AVX2-NEXT: vmovaps %ymm0, 128(%rax) 1704; AVX2-NEXT: vmovdqa %ymm5, (%rax) 1705; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1706; AVX2-NEXT: vmovaps %ymm0, 32(%rax) 1707; AVX2-NEXT: vzeroupper 1708; AVX2-NEXT: retq 1709; 1710; AVX2-FP-LABEL: store_i16_stride6_vf16: 1711; AVX2-FP: # %bb.0: 1712; AVX2-FP-NEXT: subq $24, %rsp 1713; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 1714; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm10 1715; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm1 1716; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm2 1717; AVX2-FP-NEXT: vmovdqa (%r8), %ymm8 1718; AVX2-FP-NEXT: vmovaps (%r9), %ymm3 1719; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1720; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm7 1721; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6 1722; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm9 1723; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1724; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm4 1725; AVX2-FP-NEXT: vmovdqa (%r8), %xmm5 1726; AVX2-FP-NEXT: vmovdqa (%r9), %xmm3 1727; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 1728; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] 1729; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 1730; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3] 1731; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] 1732; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] 1733; AVX2-FP-NEXT: vpshufb {{.*#+}} 
xmm13 = xmm5[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 1734; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] 1735; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] 1736; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] 1737; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] 1738; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 1739; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm9 1740; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1741; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm9 1742; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm11 1743; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm10[4],ymm0[5],ymm10[5],ymm0[6],ymm10[6],ymm0[7],ymm10[7],ymm0[12],ymm10[12],ymm0[13],ymm10[13],ymm0[14],ymm10[14],ymm0[15],ymm10[15] 1744; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3] 1745; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm0 1746; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm10 1747; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] 1748; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] 1749; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] 1750; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] 1751; AVX2-FP-NEXT: vmovdqa %ymm8, %ymm1 1752; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 1753; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] 1754; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] 1755; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 1756; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] 1757; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] 1758; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm2 1759; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1760; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] 1761; AVX2-FP-NEXT: vmovdqa %xmm6, %xmm13 1762; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] 1763; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1764; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] 1765; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] 1766; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] 1767; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] 1768; AVX2-FP-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero 1769; AVX2-FP-NEXT: vpbroadcastq %xmm15, %ymm15 1770; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] 1771; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,0,2,1,4,5,6,7] 1772; AVX2-FP-NEXT: vpbroadcastq %xmm15, %ymm15 1773; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 1774; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm14, %ymm15, %ymm12 1775; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1776; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm15 = 
ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] 1777; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] 1778; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11] 1779; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm14 1780; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6] 1781; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] 1782; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6],ymm12[7] 1783; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 1784; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] 1785; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] 1786; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 1787; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] 1788; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm15 1789; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] 1790; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm6 1791; AVX2-FP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 1792; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 1793; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 1794; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm7 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 1795; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 1796; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1797; AVX2-FP-NEXT: vpbroadcastq %xmm6, %ymm6 1798; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] 1799; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7] 1800; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] 1801; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7] 1802; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] 1803; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] 1804; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 1805; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm6, %ymm0 1806; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] 1807; AVX2-FP-NEXT: vpshufb %ymm6, %ymm11, %ymm5 1808; AVX2-FP-NEXT: vpshufb %ymm6, %ymm9, %ymm3 1809; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] 1810; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 1811; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 1812; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] 1813; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 1814; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 1815; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 1816; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 1817; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 1818; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] 1819; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] 1820; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 1821; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 1822; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1823; AVX2-FP-NEXT: vmovdqa %ymm1, 128(%rax) 1824; AVX2-FP-NEXT: vmovdqa %ymm15, 96(%rax) 1825; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 1826; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rax) 1827; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 1828; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rax) 1829; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax) 1830; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1831; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) 1832; AVX2-FP-NEXT: addq $24, %rsp 1833; AVX2-FP-NEXT: vzeroupper 1834; AVX2-FP-NEXT: retq 1835; 1836; AVX2-FCP-LABEL: store_i16_stride6_vf16: 1837; AVX2-FCP: # %bb.0: 1838; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13 1839; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1 1840; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm3 1841; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm4 1842; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm2 1843; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm6 1844; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] 1845; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm8 1846; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7 1847; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 1848; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] 1849; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 1850; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm8 1851; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm9 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 1852; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm10 1853; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm11 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 1854; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] 1855; AVX2-FCP-NEXT: vpbroadcastq %xmm9, %ymm9 1856; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7] 1857; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm9 1858; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,1,3,3,4,5,6,7] 1859; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] 1860; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7] 1861; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm11 1862; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] 1863; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] 1864; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 1865; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5 1866; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1867; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] 1868; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm14 1869; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm12 1870; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = 
ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11] 1871; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 1872; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm15 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 1873; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] 1874; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm12 1875; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 1876; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] 1877; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] 1878; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 1879; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] 1880; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] 1881; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] 1882; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] 1883; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 1884; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1885; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] 1886; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [1,0,0,2,0,0,3,0] 1887; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm14 1888; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 1889; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1] 1890; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] 1891; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 1892; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] 1893; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] 1894; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] 1895; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] 1896; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 1897; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 1898; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] 1899; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [5,0,0,6,0,0,7,0] 1900; AVX2-FCP-NEXT: vpermd %ymm15, %ymm5, %ymm5 1901; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm1[4],ymm13[5],ymm1[5],ymm13[6],ymm1[6],ymm13[7],ymm1[7],ymm13[12],ymm1[12],ymm13[13],ymm1[13],ymm13[14],ymm1[14],ymm13[15],ymm1[15] 1902; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] 1903; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] 1904; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 1905; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] 1906; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] 1907; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] 1908; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] 1909; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm0 1910; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] 1911; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] 1912; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] 1913; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 1914; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 1915; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] 1916; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] 1917; AVX2-FCP-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero 1918; AVX2-FCP-NEXT: vpbroadcastq %xmm6, %ymm6 1919; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] 1920; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] 1921; AVX2-FCP-NEXT: vpbroadcastq %xmm6, %ymm6 1922; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 1923; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 1924; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] 1925; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,0,5,0,0,6] 1926; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 1927; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] 1928; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 1929; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] 1930; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 1931; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 1932; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 1933; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 1934; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 1935; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 1936; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1937; AVX2-FCP-NEXT: vmovdqa %ymm1, 96(%rax) 1938; AVX2-FCP-NEXT: vmovdqa %ymm0, 160(%rax) 1939; AVX2-FCP-NEXT: vmovdqa %ymm14, 64(%rax) 1940; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1941; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax) 1942; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rax) 1943; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1944; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) 1945; AVX2-FCP-NEXT: vzeroupper 1946; AVX2-FCP-NEXT: retq 1947; 1948; AVX512-LABEL: store_i16_stride6_vf16: 1949; AVX512: # %bb.0: 1950; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 1951; AVX512-NEXT: vmovdqa (%rdi), %ymm0 1952; AVX512-NEXT: vmovdqa (%rsi), %ymm2 1953; AVX512-NEXT: vmovdqa (%rdx), %ymm4 1954; AVX512-NEXT: vmovdqa (%rcx), %ymm5 1955; AVX512-NEXT: vmovdqa (%r8), %ymm1 1956; AVX512-NEXT: vmovdqa (%r9), %ymm3 1957; AVX512-NEXT: vpsrldq {{.*#+}} ymm6 = 
ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 1958; AVX512-NEXT: vpsrldq {{.*#+}} ymm7 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 1959; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] 1960; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7] 1961; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 1962; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] 1963; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 1964; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] 1965; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,5,12,0,4,13,0,7] 1966; AVX512-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 1967; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7] 1968; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] 1969; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] 1970; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] 1971; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] 1972; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 1973; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] 1974; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [5,0,14,6,0,15,7,0] 1975; AVX512-NEXT: vpermi2d %ymm7, %ymm8, %ymm9 1976; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7 1977; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23] 1978; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] 1979; AVX512-NEXT: vpermi2d %zmm9, %zmm7, %zmm8 1980; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm16 1981; AVX512-NEXT: vmovdqa (%rcx), %xmm6 1982; AVX512-NEXT: vmovdqa (%rdx), %xmm7 1983; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 1984; AVX512-NEXT: vmovdqa (%rsi), %xmm8 1985; AVX512-NEXT: vmovdqa (%rdi), %xmm10 1986; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] 1987; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,8,0,1,9,0,2,10] 1988; AVX512-NEXT: vpermi2d %ymm9, %ymm11, %ymm13 1989; AVX512-NEXT: vmovdqa (%r9), %xmm9 1990; AVX512-NEXT: vmovdqa (%r8), %xmm11 1991; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] 1992; AVX512-NEXT: vpbroadcastq %xmm14, %ymm14 1993; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] 1994; AVX512-NEXT: vpsrldq {{.*#+}} xmm14 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 1995; AVX512-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 1996; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = 
xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] 1997; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,1,2,1] 1998; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5] 1999; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,2,1] 2000; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] 2001; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] 2002; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] 2003; AVX512-NEXT: vpermi2d %ymm14, %ymm12, %ymm15 2004; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] 2005; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[1,2,2,3] 2006; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] 2007; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] 2008; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7] 2009; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 2010; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] 2011; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 2012; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,12,0,5,13,0,6,14] 2013; AVX512-NEXT: vpermi2d %ymm4, %ymm0, %ymm2 2014; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] 2015; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 2016; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] 2017; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] 2018; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 2019; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,10,2,0,11,3,0] 2020; AVX512-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 2021; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] 2022; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11] 2023; AVX512-NEXT: vpermi2d %ymm1, %ymm3, %ymm2 2024; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 2025; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) 2026; AVX512-NEXT: vmovdqa64 %zmm12, (%rax) 2027; AVX512-NEXT: vmovdqa64 %zmm16, 128(%rax) 2028; AVX512-NEXT: vzeroupper 2029; AVX512-NEXT: retq 2030; 2031; AVX512-FCP-LABEL: store_i16_stride6_vf16: 2032; AVX512-FCP: # %bb.0: 2033; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2034; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 2035; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3 2036; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4 2037; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm5 2038; AVX512-FCP-NEXT: vmovdqa64 (%r8), %ymm16 2039; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 2040; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6 2041; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm7 2042; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 2043; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm8 2044; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 2045; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] 2046; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10] 2047; AVX512-FCP-NEXT: vpermi2d %ymm9, %ymm11, 
%ymm12 2048; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm9 2049; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm11 2050; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] 2051; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] 2052; AVX512-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm14 2053; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] 2054; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm13 2055; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm12 2056; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] 2057; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2058; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2059; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] 2060; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] 2061; AVX512-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm15 2062; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12 2063; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] 2064; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3] 2065; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3] 2066; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] 2067; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm13 2068; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0 2069; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] 2070; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 2071; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0] 2072; AVX512-FCP-NEXT: vpermi2d %ymm8, %ymm6, %ymm7 2073; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] 2074; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11] 2075; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 2076; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] 2077; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] 2078; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,12,0,5,13,0,6,14] 2079; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm9 2080; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 2081; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] 2082; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm10 2083; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] 2084; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm6, %zmm7 2085; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm6 2086; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] 2087; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm8 2088; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm7 2089; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] 2090; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm8 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 2091; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm9 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 2092; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] 2093; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7] 2094; AVX512-FCP-NEXT: vpermi2d %ymm8, %ymm7, %ymm9 2095; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] 2096; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7] 2097; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] 2098; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] 2099; AVX512-FCP-NEXT: vpermi2d %ymm7, %ymm9, %ymm8 2100; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] 2101; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] 2102; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,0,14,6,0,15,7,0] 2103; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 2104; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 2105; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23] 2106; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15] 2107; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 2108; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm1 2109; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) 2110; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rax) 2111; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 2112; AVX512-FCP-NEXT: vzeroupper 2113; AVX512-FCP-NEXT: retq 2114; 2115; AVX512DQ-LABEL: store_i16_stride6_vf16: 2116; AVX512DQ: # %bb.0: 2117; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 2118; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 2119; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm2 2120; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm4 2121; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm5 2122; AVX512DQ-NEXT: vmovdqa (%r8), %ymm1 2123; AVX512DQ-NEXT: vmovdqa (%r9), %ymm3 2124; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm6 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 2125; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm7 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 2126; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] 2127; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7] 2128; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 2129; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] 2130; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = 
ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 2131; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] 2132; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,5,12,0,4,13,0,7] 2133; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 2134; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7] 2135; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] 2136; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] 2137; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] 2138; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] 2139; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 2140; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] 2141; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [5,0,14,6,0,15,7,0] 2142; AVX512DQ-NEXT: vpermi2d %ymm7, %ymm8, %ymm9 2143; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7 2144; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23] 2145; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] 2146; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm7, %zmm8 2147; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm16 2148; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6 2149; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm7 2150; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 2151; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm8 2152; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10 2153; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] 2154; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,8,0,1,9,0,2,10] 2155; AVX512DQ-NEXT: vpermi2d %ymm9, %ymm11, %ymm13 2156; AVX512DQ-NEXT: vmovdqa (%r9), %xmm9 2157; AVX512DQ-NEXT: vmovdqa (%r8), %xmm11 2158; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] 2159; AVX512DQ-NEXT: vpbroadcastq %xmm14, %ymm14 2160; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] 2161; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm14 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2162; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2163; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] 2164; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,1,2,1] 2165; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5] 2166; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,2,1] 2167; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] 2168; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] 2169; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] 2170; AVX512DQ-NEXT: vpermi2d %ymm14, %ymm12, %ymm15 2171; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] 2172; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[1,2,2,3] 2173; 
AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] 2174; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] 2175; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7] 2176; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 2177; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] 2178; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 2179; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,12,0,5,13,0,6,14] 2180; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm0, %ymm2 2181; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] 2182; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 2183; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] 2184; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] 2185; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 2186; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,10,2,0,11,3,0] 2187; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 2188; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] 2189; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11] 2190; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm3, %ymm2 2191; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 2192; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax) 2193; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rax) 2194; AVX512DQ-NEXT: vmovdqa64 %zmm16, 128(%rax) 2195; AVX512DQ-NEXT: vzeroupper 2196; AVX512DQ-NEXT: retq 2197; 2198; AVX512DQ-FCP-LABEL: store_i16_stride6_vf16: 2199; AVX512DQ-FCP: # %bb.0: 2200; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2201; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 2202; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3 2203; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4 2204; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm5 2205; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %ymm16 2206; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 2207; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6 2208; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm7 2209; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 2210; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm8 2211; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 2212; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] 2213; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10] 2214; AVX512DQ-FCP-NEXT: vpermi2d %ymm9, %ymm11, %ymm12 2215; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm9 2216; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm11 2217; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] 2218; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] 2219; AVX512DQ-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm14 2220; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] 2221; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm13 2222; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm12 2223; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = 
xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] 2224; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2225; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2226; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] 2227; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] 2228; AVX512DQ-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm15 2229; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12 2230; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] 2231; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3] 2232; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3] 2233; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] 2234; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm13 2235; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0 2236; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] 2237; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 2238; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0] 2239; AVX512DQ-FCP-NEXT: vpermi2d %ymm8, %ymm6, %ymm7 2240; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] 2241; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11] 2242; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 2243; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] 2244; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] 2245; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,12,0,5,13,0,6,14] 2246; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm9 2247; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 2248; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] 2249; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm10 2250; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] 2251; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm6, %zmm7 2252; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm6 2253; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] 2254; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm8 2255; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm7 2256; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] 2257; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm8 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 2258; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm9 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 2259; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = 
ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] 2260; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7] 2261; AVX512DQ-FCP-NEXT: vpermi2d %ymm8, %ymm7, %ymm9 2262; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] 2263; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7] 2264; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] 2265; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] 2266; AVX512DQ-FCP-NEXT: vpermi2d %ymm7, %ymm9, %ymm8 2267; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] 2268; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] 2269; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,0,14,6,0,15,7,0] 2270; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 2271; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 2272; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23] 2273; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15] 2274; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 2275; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm1 2276; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) 2277; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%rax) 2278; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 2279; AVX512DQ-FCP-NEXT: vzeroupper 2280; AVX512DQ-FCP-NEXT: retq 2281; 2282; AVX512BW-LABEL: store_i16_stride6_vf16: 2283; AVX512BW: # %bb.0: 2284; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 2285; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 2286; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 2287; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 2288; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 2289; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 2290; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 2291; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21] 2292; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 2293; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] 2294; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 2295; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26] 2296; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 2297; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] 2298; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 2299; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0] 2300; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 2301; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] 2302; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 2303; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) 2304; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) 2305; AVX512BW-NEXT: vmovdqa64 %zmm4, 
(%rax) 2306; AVX512BW-NEXT: vzeroupper 2307; AVX512BW-NEXT: retq 2308; 2309; AVX512BW-FCP-LABEL: store_i16_stride6_vf16: 2310; AVX512BW-FCP: # %bb.0: 2311; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2312; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 2313; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 2314; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 2315; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 2316; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 2317; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 2318; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21] 2319; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 2320; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] 2321; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 2322; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26] 2323; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 2324; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] 2325; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 2326; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0] 2327; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 2328; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] 2329; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 2330; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) 2331; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) 2332; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 2333; AVX512BW-FCP-NEXT: vzeroupper 2334; AVX512BW-FCP-NEXT: retq 2335; 2336; AVX512DQ-BW-LABEL: store_i16_stride6_vf16: 2337; AVX512DQ-BW: # %bb.0: 2338; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 2339; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 2340; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 2341; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 2342; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 2343; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 2344; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 2345; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21] 2346; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 2347; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] 2348; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 2349; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26] 2350; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 2351; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] 2352; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 2353; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0] 2354; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 2355; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] 2356; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 2357; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) 
2358; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax) 2359; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax) 2360; AVX512DQ-BW-NEXT: vzeroupper 2361; AVX512DQ-BW-NEXT: retq 2362; 2363; AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf16: 2364; AVX512DQ-BW-FCP: # %bb.0: 2365; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2366; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 2367; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 2368; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 2369; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 2370; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 2371; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 2372; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21] 2373; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 2374; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] 2375; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 2376; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26] 2377; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 2378; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] 2379; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 2380; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0] 2381; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 2382; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] 2383; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 2384; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) 2385; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) 2386; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 2387; AVX512DQ-BW-FCP-NEXT: vzeroupper 2388; AVX512DQ-BW-FCP-NEXT: retq 2389 %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 64 2390 %in.vec1 = load <16 x i16>, ptr %in.vecptr1, align 64 2391 %in.vec2 = load <16 x i16>, ptr %in.vecptr2, align 64 2392 %in.vec3 = load <16 x i16>, ptr %in.vecptr3, align 64 2393 %in.vec4 = load <16 x i16>, ptr %in.vecptr4, align 64 2394 %in.vec5 = load <16 x i16>, ptr %in.vecptr5, align 64 2395 %1 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2396 %2 = shufflevector <16 x i16> %in.vec2, <16 x i16> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2397 %3 = shufflevector <16 x i16> %in.vec4, <16 x i16> %in.vec5, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2398 %4 = shufflevector <32 x i16> %1, <32 x i16> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 
11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 2399 %5 = shufflevector <32 x i16> %3, <32 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2400 %6 = shufflevector <64 x i16> %4, <64 x i16> %5, <96 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 2401 %interleaved.vec = shufflevector <96 x i16> %6, <96 x i16> poison, <96 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 80, i32 1, i32 17, i32 33, i32 49, i32 65, i32 81, i32 2, i32 18, i32 34, i32 50, i32 66, i32 82, i32 3, i32 19, i32 35, i32 51, i32 67, i32 83, i32 4, i32 20, i32 36, i32 52, i32 68, i32 84, i32 5, i32 21, i32 37, i32 53, i32 69, i32 85, i32 6, i32 22, i32 38, i32 54, i32 70, i32 86, i32 7, i32 23, i32 39, i32 55, i32 71, i32 87, i32 8, i32 24, i32 40, i32 56, i32 72, i32 88, i32 9, i32 25, i32 41, i32 57, i32 73, i32 89, i32 10, i32 26, i32 42, i32 58, i32 74, i32 90, i32 11, i32 27, i32 43, i32 59, i32 75, i32 91, i32 12, i32 28, i32 44, i32 60, i32 76, i32 92, i32 13, i32 29, i32 45, i32 61, i32 77, i32 93, i32 14, i32 30, i32 46, i32 62, i32 78, i32 94, i32 15, i32 31, i32 47, i32 63, i32 79, i32 95> 2402 store <96 x i16> %interleaved.vec, ptr %out.vec, align 64 2403 ret void 2404} 2405 2406define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { 2407; SSE-LABEL: store_i16_stride6_vf32: 2408; SSE: # %bb.0: 2409; SSE-NEXT: subq $312, %rsp # imm = 0x138 2410; SSE-NEXT: movdqa (%rdi), %xmm2 2411; SSE-NEXT: movdqa 16(%rdi), %xmm13 2412; SSE-NEXT: movdqa (%rsi), %xmm3 2413; SSE-NEXT: movdqa 16(%rsi), %xmm1 2414; SSE-NEXT: movdqa (%rdx), %xmm5 2415; SSE-NEXT: movdqa 16(%rdx), %xmm14 2416; SSE-NEXT: movdqa (%rcx), %xmm4 2417; SSE-NEXT: movdqa 16(%rcx), %xmm10 2418; SSE-NEXT: movdqa (%r8), %xmm8 2419; SSE-NEXT: movdqa (%r9), %xmm11 2420; SSE-NEXT: movdqa %xmm5, %xmm0 2421; SSE-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 2422; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2423; SSE-NEXT: movdqa %xmm2, %xmm9 2424; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] 2425; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2426; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm0[3,3] 2427; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[2,1,3,3,4,5,6,7] 2428; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm7[0,1] 2429; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] 2430; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,0] 2431; SSE-NEXT: andps %xmm6, %xmm9 2432; SSE-NEXT: movdqa %xmm11, %xmm7 2433; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2434; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] 2435; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] 2436; SSE-NEXT: movaps %xmm6, %xmm0 2437; SSE-NEXT: andnps %xmm11, %xmm0 2438; SSE-NEXT: orps %xmm9, %xmm0 2439; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2440; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 2441; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2442; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 2443; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2444; SSE-NEXT: movdqa %xmm2, %xmm3 2445; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm5[3,3] 2446; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,7,7] 2447; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm4[2,3] 2448; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3] 2449; SSE-NEXT: andps %xmm6, %xmm3 2450; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] 2451; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] 2452; SSE-NEXT: movaps %xmm6, %xmm0 2453; SSE-NEXT: andnps %xmm4, %xmm0 2454; SSE-NEXT: orps %xmm3, %xmm0 2455; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2456; SSE-NEXT: movdqa %xmm14, %xmm0 2457; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] 2458; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2459; SSE-NEXT: movdqa %xmm13, %xmm11 2460; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] 2461; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2462; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[3,3] 2463; SSE-NEXT: movdqa 16(%r8), %xmm15 2464; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[2,1,3,3,4,5,6,7] 2465; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm9[0,1] 2466; SSE-NEXT: movdqa 16(%r9), %xmm9 2467; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm9[0,2,2,3,4,5,6,7] 2468; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] 2469; SSE-NEXT: movaps %xmm6, %xmm0 2470; SSE-NEXT: andnps %xmm12, %xmm0 2471; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] 2472; SSE-NEXT: andps %xmm6, %xmm11 2473; SSE-NEXT: orps %xmm11, %xmm0 2474; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2475; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] 2476; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2477; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] 2478; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2479; SSE-NEXT: 
movdqa %xmm13, %xmm1 2480; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm14[3,3] 2481; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,6,5,7,7] 2482; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm10[2,3] 2483; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,6,6,7] 2484; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] 2485; SSE-NEXT: movaps %xmm6, %xmm0 2486; SSE-NEXT: andnps %xmm10, %xmm0 2487; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 2488; SSE-NEXT: andps %xmm6, %xmm1 2489; SSE-NEXT: orps %xmm1, %xmm0 2490; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2491; SSE-NEXT: movdqa 32(%rdx), %xmm2 2492; SSE-NEXT: movdqa 32(%rcx), %xmm1 2493; SSE-NEXT: movdqa %xmm2, %xmm0 2494; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2495; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2496; SSE-NEXT: movdqa 32(%rdi), %xmm3 2497; SSE-NEXT: movdqa 32(%rsi), %xmm11 2498; SSE-NEXT: movdqa %xmm3, %xmm10 2499; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] 2500; SSE-NEXT: movdqa %xmm10, %xmm12 2501; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[3,3] 2502; SSE-NEXT: movdqa 32(%r8), %xmm14 2503; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm14[2,1,3,3,4,5,6,7] 2504; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm13[0,1] 2505; SSE-NEXT: movdqa 32(%r9), %xmm4 2506; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] 2507; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2508; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 2509; SSE-NEXT: movaps %xmm6, %xmm13 2510; SSE-NEXT: andnps %xmm0, %xmm13 2511; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] 2512; SSE-NEXT: andps %xmm6, %xmm12 2513; SSE-NEXT: orps %xmm12, %xmm13 2514; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2515; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 2516; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2517; SSE-NEXT: movdqa %xmm3, %xmm0 2518; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] 2519; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2520; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3] 2521; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,7,7] 2522; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] 2523; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,6,6,7] 2524; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 2525; SSE-NEXT: movaps %xmm6, %xmm11 2526; SSE-NEXT: andnps %xmm1, %xmm11 2527; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] 2528; SSE-NEXT: andps %xmm6, %xmm0 2529; SSE-NEXT: orps %xmm0, %xmm11 2530; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2531; SSE-NEXT: movdqa 48(%rdx), %xmm3 2532; SSE-NEXT: movdqa 48(%rcx), %xmm4 2533; SSE-NEXT: movdqa %xmm3, %xmm5 2534; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 2535; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2536; SSE-NEXT: movdqa 48(%rdi), %xmm0 2537; SSE-NEXT: movdqa 48(%rsi), %xmm1 2538; SSE-NEXT: movdqa %xmm0, %xmm11 2539; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] 2540; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2541; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[3,3] 2542; SSE-NEXT: movdqa 48(%r8), %xmm12 2543; SSE-NEXT: pshuflw {{.*#+}} 
xmm13 = xmm12[2,1,3,3,4,5,6,7] 2544; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm13[0,1] 2545; SSE-NEXT: movdqa 48(%r9), %xmm2 2546; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,2,2,3,4,5,6,7] 2547; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill 2548; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] 2549; SSE-NEXT: movaps %xmm6, %xmm7 2550; SSE-NEXT: andnps %xmm13, %xmm7 2551; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] 2552; SSE-NEXT: andps %xmm6, %xmm11 2553; SSE-NEXT: orps %xmm11, %xmm7 2554; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2555; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 2556; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2557; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2558; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2559; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[3,3] 2560; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,6,5,7,7] 2561; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] 2562; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] 2563; SSE-NEXT: andps %xmm6, %xmm0 2564; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] 2565; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 2566; SSE-NEXT: andnps %xmm1, %xmm6 2567; SSE-NEXT: orps %xmm0, %xmm6 2568; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2569; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2570; SSE-NEXT: movaps %xmm5, %xmm0 2571; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2572; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] 2573; SSE-NEXT: movdqa %xmm8, %xmm1 2574; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[1,3] 2575; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] 2576; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2577; SSE-NEXT: movdqa %xmm3, %xmm11 2578; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] 2579; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] 2580; SSE-NEXT: movdqa %xmm1, %xmm2 2581; SSE-NEXT: pandn %xmm11, %xmm2 2582; SSE-NEXT: andps %xmm1, %xmm0 2583; SSE-NEXT: por %xmm0, %xmm2 2584; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2585; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] 2586; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[1,1,1,1,4,5,6,7] 2587; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm5[1,1] 2588; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm4[0,2] 2589; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] 2590; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] 2591; SSE-NEXT: movdqa %xmm5, %xmm0 2592; SSE-NEXT: pandn %xmm6, %xmm0 2593; SSE-NEXT: andps %xmm5, %xmm11 2594; SSE-NEXT: por %xmm11, %xmm0 2595; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2596; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 2597; SSE-NEXT: movaps %xmm7, %xmm6 2598; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2599; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] 2600; SSE-NEXT: movdqa %xmm8, %xmm11 2601; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm4[1,3] 2602; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[0,2] 2603; SSE-NEXT: movdqa %xmm3, %xmm0 2604; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,2,3,3] 2605; SSE-NEXT: pslld $16, %xmm0 2606; SSE-NEXT: movdqa %xmm1, %xmm2 2607; SSE-NEXT: pandn %xmm0, %xmm2 2608; SSE-NEXT: 
andps %xmm1, %xmm6 2609; SSE-NEXT: por %xmm6, %xmm2 2610; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2611; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] 2612; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2613; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] 2614; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[0,2] 2615; SSE-NEXT: movdqa %xmm5, %xmm0 2616; SSE-NEXT: pandn %xmm11, %xmm0 2617; SSE-NEXT: andps %xmm5, %xmm8 2618; SSE-NEXT: por %xmm8, %xmm0 2619; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2620; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2621; SSE-NEXT: movaps %xmm3, %xmm6 2622; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2623; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] 2624; SSE-NEXT: movdqa %xmm15, %xmm2 2625; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3] 2626; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[0,2] 2627; SSE-NEXT: movdqa %xmm9, %xmm8 2628; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5] 2629; SSE-NEXT: movdqa %xmm1, %xmm2 2630; SSE-NEXT: pandn %xmm8, %xmm2 2631; SSE-NEXT: andps %xmm1, %xmm6 2632; SSE-NEXT: por %xmm6, %xmm2 2633; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2634; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] 2635; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[1,1,1,1,4,5,6,7] 2636; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] 2637; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,2] 2638; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,1,1] 2639; SSE-NEXT: movdqa %xmm5, %xmm0 2640; SSE-NEXT: pandn %xmm8, %xmm0 2641; SSE-NEXT: andps %xmm5, %xmm6 2642; SSE-NEXT: por %xmm6, %xmm0 2643; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2644; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2645; SSE-NEXT: movaps %xmm2, %xmm8 2646; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2647; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] 2648; SSE-NEXT: movdqa %xmm15, %xmm6 2649; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm0[1,3] 2650; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm6[0,2] 2651; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[2,2,3,3] 2652; SSE-NEXT: pslld $16, %xmm9 2653; SSE-NEXT: movdqa %xmm1, %xmm7 2654; SSE-NEXT: pandn %xmm9, %xmm7 2655; SSE-NEXT: andps %xmm1, %xmm8 2656; SSE-NEXT: por %xmm8, %xmm7 2657; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] 2658; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2659; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm2[1,1] 2660; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,2] 2661; SSE-NEXT: movdqa %xmm5, %xmm8 2662; SSE-NEXT: pandn %xmm11, %xmm8 2663; SSE-NEXT: andps %xmm5, %xmm15 2664; SSE-NEXT: por %xmm15, %xmm8 2665; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2666; SSE-NEXT: movdqa %xmm2, %xmm9 2667; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] 2668; SSE-NEXT: movdqa %xmm14, %xmm3 2669; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm10[1,3] 2670; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2] 2671; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2672; SSE-NEXT: movdqa %xmm0, %xmm11 2673; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] 2674; SSE-NEXT: movdqa %xmm1, %xmm6 2675; SSE-NEXT: pandn %xmm11, %xmm6 2676; SSE-NEXT: andps %xmm1, %xmm9 2677; 
SSE-NEXT: por %xmm9, %xmm6 2678; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm2[1] 2679; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm14[1,1,1,1,4,5,6,7] 2680; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[1,1] 2681; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm10[0,2] 2682; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1] 2683; SSE-NEXT: movdqa %xmm5, %xmm9 2684; SSE-NEXT: pandn %xmm13, %xmm9 2685; SSE-NEXT: andps %xmm5, %xmm11 2686; SSE-NEXT: por %xmm11, %xmm9 2687; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2688; SSE-NEXT: movaps %xmm3, %xmm11 2689; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2690; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] 2691; SSE-NEXT: movdqa %xmm14, %xmm13 2692; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm2[1,3] 2693; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm13[0,2] 2694; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[2,2,3,3] 2695; SSE-NEXT: pslld $16, %xmm0 2696; SSE-NEXT: movdqa %xmm1, %xmm15 2697; SSE-NEXT: pandn %xmm0, %xmm15 2698; SSE-NEXT: andps %xmm1, %xmm11 2699; SSE-NEXT: por %xmm11, %xmm15 2700; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] 2701; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2702; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] 2703; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm2[0,2] 2704; SSE-NEXT: movdqa %xmm5, %xmm10 2705; SSE-NEXT: pandn %xmm13, %xmm10 2706; SSE-NEXT: andps %xmm5, %xmm14 2707; SSE-NEXT: por %xmm14, %xmm10 2708; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2709; SSE-NEXT: movaps %xmm2, %xmm11 2710; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2711; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] 2712; SSE-NEXT: movdqa %xmm12, %xmm13 2713; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[1,3] 2714; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm13[0,2] 2715; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload 2716; SSE-NEXT: movdqa %xmm4, %xmm14 2717; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] 2718; SSE-NEXT: movdqa %xmm1, %xmm13 2719; SSE-NEXT: pandn %xmm14, %xmm13 2720; SSE-NEXT: andps %xmm1, %xmm11 2721; SSE-NEXT: por %xmm11, %xmm13 2722; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] 2723; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[1,1,1,1,4,5,6,7] 2724; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm2[1,1] 2725; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2] 2726; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] 2727; SSE-NEXT: movdqa %xmm5, %xmm11 2728; SSE-NEXT: pandn %xmm2, %xmm11 2729; SSE-NEXT: andps %xmm5, %xmm14 2730; SSE-NEXT: por %xmm14, %xmm11 2731; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2732; SSE-NEXT: movaps %xmm3, %xmm2 2733; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2734; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 2735; SSE-NEXT: movdqa %xmm12, %xmm14 2736; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,1],xmm0[1,3] 2737; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm14[0,2] 2738; SSE-NEXT: andps %xmm1, %xmm2 2739; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[2,2,3,3] 2740; SSE-NEXT: pslld $16, %xmm4 2741; SSE-NEXT: pandn %xmm4, %xmm1 2742; SSE-NEXT: por %xmm2, %xmm1 2743; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] 2744; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2745; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm3[1,1] 2746; SSE-NEXT: shufps {{.*#+}} 
xmm12 = xmm12[2,0],xmm0[0,2] 2747; SSE-NEXT: andps %xmm5, %xmm12 2748; SSE-NEXT: pandn %xmm14, %xmm5 2749; SSE-NEXT: por %xmm12, %xmm5 2750; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 2751; SSE-NEXT: movdqa %xmm5, 352(%rax) 2752; SSE-NEXT: movdqa %xmm1, 336(%rax) 2753; SSE-NEXT: movdqa %xmm11, 304(%rax) 2754; SSE-NEXT: movdqa %xmm13, 288(%rax) 2755; SSE-NEXT: movdqa %xmm10, 256(%rax) 2756; SSE-NEXT: movdqa %xmm15, 240(%rax) 2757; SSE-NEXT: movdqa %xmm9, 208(%rax) 2758; SSE-NEXT: movdqa %xmm6, 192(%rax) 2759; SSE-NEXT: movdqa %xmm8, 160(%rax) 2760; SSE-NEXT: movdqa %xmm7, 144(%rax) 2761; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2762; SSE-NEXT: movaps %xmm0, 112(%rax) 2763; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2764; SSE-NEXT: movaps %xmm0, 96(%rax) 2765; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2766; SSE-NEXT: movaps %xmm0, 64(%rax) 2767; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2768; SSE-NEXT: movaps %xmm0, 48(%rax) 2769; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2770; SSE-NEXT: movaps %xmm0, 16(%rax) 2771; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2772; SSE-NEXT: movaps %xmm0, (%rax) 2773; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2774; SSE-NEXT: movaps %xmm0, 368(%rax) 2775; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2776; SSE-NEXT: movaps %xmm0, 320(%rax) 2777; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2778; SSE-NEXT: movaps %xmm0, 272(%rax) 2779; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2780; SSE-NEXT: movaps %xmm0, 224(%rax) 2781; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2782; SSE-NEXT: movaps %xmm0, 176(%rax) 2783; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2784; SSE-NEXT: movaps %xmm0, 128(%rax) 2785; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2786; SSE-NEXT: movaps %xmm0, 80(%rax) 2787; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2788; SSE-NEXT: movaps %xmm0, 32(%rax) 2789; SSE-NEXT: addq $312, %rsp # imm = 0x138 2790; SSE-NEXT: retq 2791; 2792; AVX-LABEL: store_i16_stride6_vf32: 2793; AVX: # %bb.0: 2794; AVX-NEXT: subq $120, %rsp 2795; AVX-NEXT: vmovdqa 32(%rcx), %xmm8 2796; AVX-NEXT: vmovdqa 48(%rcx), %xmm0 2797; AVX-NEXT: vmovdqa 32(%rdx), %xmm9 2798; AVX-NEXT: vmovdqa 48(%rdx), %xmm1 2799; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2800; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] 2801; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2802; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,0,1,1] 2803; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm1 2804; AVX-NEXT: vmovdqa 32(%rsi), %xmm10 2805; AVX-NEXT: vmovdqa 48(%rsi), %xmm2 2806; AVX-NEXT: vmovdqa 32(%rdi), %xmm11 2807; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 2808; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] 2809; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] 2810; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] 2811; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] 2812; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 2813; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 2814; AVX-NEXT: vextractf128 $1, %ymm5, %xmm2 2815; AVX-NEXT: vmovdqa 48(%r8), %xmm1 
2816; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] 2817; AVX-NEXT: vmovdqa 48(%r9), %xmm2 2818; AVX-NEXT: vpslld $16, %xmm2, %xmm12 2819; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7] 2820; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2821; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,1,3,3,4,5,6,7] 2822; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] 2823; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3] 2824; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] 2825; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] 2826; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6],xmm7[7] 2827; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2828; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] 2829; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 2830; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2] 2831; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] 2832; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 2833; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 2834; AVX-NEXT: vextractf128 $1, %ymm3, %xmm4 2835; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,7,7] 2836; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] 2837; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] 2838; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,6,6,7] 2839; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] 2840; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] 2841; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2842; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2843; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] 2844; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] 2845; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] 2846; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2847; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] 2848; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 2849; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2] 2850; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] 2851; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 2852; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] 2853; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 2854; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 2855; AVX-NEXT: vmovdqa 32(%r8), %xmm4 2856; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,7,7] 2857; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] 2858; AVX-NEXT: vextractf128 $1, %ymm12, %xmm13 2859; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm13[2,3,4,5],xmm5[6,7] 2860; AVX-NEXT: vmovdqa 32(%r9), %xmm5 2861; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,6,6,7] 2862; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] 2863; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6],xmm14[7] 2864; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2865; AVX-NEXT: vpsrldq {{.*#+}} xmm13 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2866; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3] 2867; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] 2868; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3],xmm12[4,5,6,7] 2869; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2870; 
AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,0,1,1] 2871; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] 2872; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 2873; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,1,0,1] 2874; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 2875; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] 2876; AVX-NEXT: vextractf128 $1, %ymm0, %xmm6 2877; AVX-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 2878; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3],xmm6[4,5,6,7] 2879; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] 2880; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3],xmm6[4,5,6,7] 2881; AVX-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill 2882; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 2883; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] 2884; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] 2885; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2886; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 2887; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 2888; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] 2889; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2890; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 2891; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] 2892; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm6 2893; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] 2894; AVX-NEXT: vextractf128 $1, %ymm1, %xmm6 2895; AVX-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 2896; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6,7] 2897; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,0,1,1] 2898; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7] 2899; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2900; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] 2901; AVX-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] 2902; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5],xmm1[6,7] 2903; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2904; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 2905; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] 2906; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2907; AVX-NEXT: vmovdqa 16(%rcx), %xmm1 2908; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 2909; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] 2910; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 2911; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] 2912; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 2913; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5],xmm2[6,7] 2914; AVX-NEXT: vpslld $16, %xmm5, %xmm3 2915; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] 2916; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2917; AVX-NEXT: vmovdqa 16(%rdx), %xmm2 2918; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] 2919; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 2920; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3] 2921; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7] 2922; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 2923; AVX-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0],xmm3[1],xmm0[2,3,4,5,6],xmm3[7] 2924; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2925; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 2926; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] 2927; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 2928; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] 2929; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2930; AVX-NEXT: vmovdqa 16(%rsi), %xmm2 2931; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 2932; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 2933; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 2934; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] 2935; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,1] 2936; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 2937; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] 2938; AVX-NEXT: vmovdqa 16(%r8), %xmm3 2939; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 2940; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] 2941; AVX-NEXT: vmovdqa 16(%r9), %xmm2 2942; AVX-NEXT: vpslld $16, %xmm2, %xmm6 2943; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7] 2944; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2945; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] 2946; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] 2947; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0],xmm0[1,2],xmm5[3] 2948; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7] 2949; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] 2950; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5,6],xmm5[7] 2951; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2952; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] 2953; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 2954; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] 2955; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] 2956; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 2957; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 2958; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7] 2959; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 2960; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4 2961; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3,4,5],xmm1[6,7] 2962; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,6,6,7] 2963; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] 2964; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4,5,6],xmm4[7] 2965; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2966; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2967; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 2968; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] 2969; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] 2970; AVX-NEXT: vmovdqa (%rcx), %xmm9 2971; AVX-NEXT: vmovdqa (%rdx), %xmm8 2972; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] 2973; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] 2974; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] 2975; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2976; AVX-NEXT: vmovdqa (%rsi), %xmm7 2977; AVX-NEXT: vmovdqa (%rdi), %xmm6 2978; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = 
xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 2979; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] 2980; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 2981; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 2982; AVX-NEXT: vmovdqa (%r8), %xmm1 2983; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,7,7] 2984; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 2985; AVX-NEXT: vextractf128 $1, %ymm12, %xmm13 2986; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3,4,5],xmm0[6,7] 2987; AVX-NEXT: vmovdqa (%r9), %xmm0 2988; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,4,6,6,7] 2989; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] 2990; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6],xmm14[7] 2991; AVX-NEXT: vpsrldq {{.*#+}} xmm14 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 2992; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3] 2993; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] 2994; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3],xmm12[4,5,6,7] 2995; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,1,1] 2996; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] 2997; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm14, %ymm11 2998; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,1,0,1] 2999; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 3000; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] 3001; AVX-NEXT: vextractf128 $1, %ymm11, %xmm10 3002; AVX-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 3003; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3],xmm10[4,5,6,7] 3004; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[0,0,1,1] 3005; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3],xmm10[4,5,6,7] 3006; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm11[0,1],xmm3[0],xmm11[3] 3007; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] 3008; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6,7] 3009; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] 3010; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] 3011; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] 3012; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 3013; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] 3014; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] 3015; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 3016; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] 3017; AVX-NEXT: vextractf128 $1, %ymm7, %xmm8 3018; AVX-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 3019; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] 3020; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] 3021; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] 3022; AVX-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0],xmm7[3] 3023; AVX-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 3024; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm9[5],xmm7[6,7] 3025; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] 3026; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] 3027; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 3028; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] 3029; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] 3030; AVX-NEXT: vinsertf128 $1, %xmm4, 
%ymm5, %ymm4 3031; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 3032; AVX-NEXT: vextractf128 $1, %ymm3, %xmm4 3033; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] 3034; AVX-NEXT: vpslld $16, %xmm0, %xmm5 3035; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5],xmm4[6,7] 3036; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] 3037; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 3038; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2],xmm1[3] 3039; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3040; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 3041; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] 3042; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 3043; AVX-NEXT: vmovdqa %xmm0, 32(%rax) 3044; AVX-NEXT: vmovdqa %xmm4, 48(%rax) 3045; AVX-NEXT: vmovdqa %xmm7, (%rax) 3046; AVX-NEXT: vmovdqa %xmm8, 16(%rax) 3047; AVX-NEXT: vmovdqa %xmm2, 96(%rax) 3048; AVX-NEXT: vmovdqa %xmm10, 112(%rax) 3049; AVX-NEXT: vmovdqa %xmm12, 64(%rax) 3050; AVX-NEXT: vmovdqa %xmm13, 80(%rax) 3051; AVX-NEXT: vmovdqa %xmm15, 160(%rax) 3052; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3053; AVX-NEXT: vmovaps %xmm0, 176(%rax) 3054; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3055; AVX-NEXT: vmovaps %xmm0, 128(%rax) 3056; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3057; AVX-NEXT: vmovaps %xmm0, 144(%rax) 3058; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3059; AVX-NEXT: vmovaps %xmm0, 224(%rax) 3060; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3061; AVX-NEXT: vmovaps %xmm0, 240(%rax) 3062; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3063; AVX-NEXT: vmovaps %xmm0, 192(%rax) 3064; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3065; AVX-NEXT: vmovaps %xmm0, 208(%rax) 3066; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3067; AVX-NEXT: vmovaps %xmm0, 288(%rax) 3068; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3069; AVX-NEXT: vmovaps %xmm0, 304(%rax) 3070; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3071; AVX-NEXT: vmovaps %xmm0, 256(%rax) 3072; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3073; AVX-NEXT: vmovaps %xmm0, 272(%rax) 3074; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3075; AVX-NEXT: vmovaps %xmm0, 352(%rax) 3076; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3077; AVX-NEXT: vmovaps %xmm0, 368(%rax) 3078; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3079; AVX-NEXT: vmovaps %xmm0, 320(%rax) 3080; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3081; AVX-NEXT: vmovaps %xmm0, 336(%rax) 3082; AVX-NEXT: addq $120, %rsp 3083; AVX-NEXT: vzeroupper 3084; AVX-NEXT: retq 3085; 3086; AVX2-LABEL: store_i16_stride6_vf32: 3087; AVX2: # %bb.0: 3088; AVX2-NEXT: subq $616, %rsp # imm = 0x268 3089; AVX2-NEXT: vmovdqa (%rcx), %xmm13 3090; AVX2-NEXT: vmovdqa 32(%rcx), %xmm9 3091; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 3092; AVX2-NEXT: vmovdqa (%rdx), %xmm1 3093; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3094; AVX2-NEXT: vmovdqa 32(%rdx), %xmm11 3095; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 3096; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3097; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 3098; AVX2-NEXT: vmovdqa (%rsi), %xmm15 3099; AVX2-NEXT: vmovdqa 32(%rsi), %xmm5 3100; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,1,2,1] 3101; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] 3102; AVX2-NEXT: vmovdqa (%rdi), %xmm2 3103; AVX2-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill 3104; AVX2-NEXT: vmovdqa 32(%rdi), %xmm14 3105; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 3106; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] 3107; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 3108; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 3109; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 3110; AVX2-NEXT: vmovdqa (%r8), %xmm1 3111; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3112; AVX2-NEXT: vmovdqa 32(%r8), %xmm7 3113; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] 3114; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 3115; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] 3116; AVX2-NEXT: vmovdqa (%r9), %xmm0 3117; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3118; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3119; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 3120; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] 3121; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 3122; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 3123; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3124; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 3125; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3126; AVX2-NEXT: vpsrldq {{.*#+}} xmm2 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 3127; AVX2-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3128; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 3129; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] 3130; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3131; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] 3132; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,2,1] 3133; AVX2-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3134; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] 3135; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 3136; AVX2-NEXT: vmovdqa 32(%r9), %xmm4 3137; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 3138; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 3139; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 3140; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,3,3,4,5,6,7] 3141; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3142; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 3143; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 3144; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] 3145; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3146; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] 3147; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 3148; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 3149; AVX2-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3150; AVX2-NEXT: vmovdqa 32(%rdx), %ymm2 3151; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3152; AVX2-NEXT: vmovdqa 32(%rcx), %ymm1 3153; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3154; AVX2-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3155; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3156; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 3157; AVX2-NEXT: vmovdqa 32(%rsi), %ymm10 3158; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,1,2,3,6,5,6,7] 3159; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3160; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 3161; AVX2-NEXT: vmovdqa 32(%rdi), %ymm8 3162; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,1,2,3,6,5,6,7] 3163; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3164; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 3165; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 3166; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 3167; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 3168; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 3169; AVX2-NEXT: vmovdqa 32(%r8), %ymm2 3170; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3171; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 3172; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 3173; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 3174; AVX2-NEXT: vmovdqa 32(%r9), %ymm2 3175; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3176; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 3177; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 3178; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 3179; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 3180; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3181; AVX2-NEXT: vmovdqa (%rdx), %ymm2 3182; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3183; AVX2-NEXT: vmovdqa (%rcx), %ymm1 3184; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3185; AVX2-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3186; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3187; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 3188; AVX2-NEXT: vmovdqa (%rsi), %ymm2 3189; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3190; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] 3191; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 3192; AVX2-NEXT: vmovdqa (%rdi), %ymm12 3193; AVX2-NEXT: 
vpshufd {{.*#+}} ymm3 = ymm12[2,1,2,3,6,5,6,7] 3194; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 3195; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 3196; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 3197; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 3198; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 3199; AVX2-NEXT: vmovdqa (%r8), %ymm2 3200; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3201; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 3202; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 3203; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 3204; AVX2-NEXT: vmovdqa (%r9), %ymm2 3205; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3206; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 3207; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 3208; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] 3209; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm0 3210; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3211; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] 3212; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] 3213; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] 3214; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] 3215; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 3216; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 3217; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 3218; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm6 3219; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 3220; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] 3221; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] 3222; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] 3223; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,1] 3224; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 3225; AVX2-NEXT: vpblendvb %ymm3, %ymm6, %ymm14, %ymm0 3226; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3227; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 3228; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] 3229; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] 3230; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 3231; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] 3232; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3] 3233; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] 3234; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7] 3235; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 3236; AVX2-NEXT: vpshufb %xmm1, %xmm11, %xmm1 3237; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 3238; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] 3239; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3240; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] 3241; AVX2-NEXT: vpshuflw {{.*#+}} 
xmm6 = xmm6[0,2,2,1,4,5,6,7] 3242; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 3243; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm6, %ymm1 3244; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3245; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] 3246; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] 3247; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3248; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload 3249; AVX2-NEXT: # ymm6 = ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15] 3250; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,2,3,3,5,6,7,7] 3251; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] 3252; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7] 3253; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 3254; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3255; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm14 3256; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 3257; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6],ymm14[7] 3258; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 3259; AVX2-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] 3260; AVX2-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 3261; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 3262; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm14, %ymm1 3263; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3264; AVX2-NEXT: vmovdqa %ymm3, %ymm14 3265; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3266; AVX2-NEXT: vmovdqa %ymm12, %ymm3 3267; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] 3268; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] 3269; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3270; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3271; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] 3272; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,2,3,3,5,6,7,7] 3273; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] 3274; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] 3275; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 3276; AVX2-NEXT: vpshufb %ymm6, %ymm8, %ymm6 3277; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] 3278; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] 3279; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 3280; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[2,3,2,3,6,7,6,7] 3281; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 3282; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] 3283; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm6 3284; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] 3285; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] 3286; 
AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 3287; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] 3288; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 3289; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 3290; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero 3291; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 3292; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 3293; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] 3294; AVX2-NEXT: vpbroadcastq %xmm0, %ymm12 3295; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 3296; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm12 3297; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3298; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 3299; AVX2-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 3300; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3301; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3302; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 3303; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 3304; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] 3305; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 3306; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 3307; AVX2-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3308; AVX2-NEXT: # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 3309; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 3310; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 3311; AVX2-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3312; AVX2-NEXT: # xmm1 = mem[0,0,2,1,4,5,6,7] 3313; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 3314; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 3315; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3316; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3317; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] 3318; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3319; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload 3320; AVX2-NEXT: # ymm13 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] 3321; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] 3322; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,0,2,2,5,4,6,6] 3323; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] 3324; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7] 3325; AVX2-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 3326; AVX2-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 3327; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] 3328; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] 3329; AVX2-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload 3330; AVX2-NEXT: # ymm9 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 3331; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] 3332; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm9, %ymm0 3333; AVX2-NEXT: 
vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] 3334; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] 3335; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] 3336; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,2,2,5,4,6,6] 3337; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] 3338; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7] 3339; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 3340; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] 3341; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 3342; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 3343; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 3344; AVX2-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm2 3345; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 3346; AVX2-NEXT: vmovdqa %ymm2, 96(%rax) 3347; AVX2-NEXT: vmovdqa %ymm6, 160(%rax) 3348; AVX2-NEXT: vmovdqa %ymm0, 288(%rax) 3349; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3350; AVX2-NEXT: vmovaps %ymm0, 352(%rax) 3351; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3352; AVX2-NEXT: vmovaps %ymm0, 64(%rax) 3353; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3354; AVX2-NEXT: vmovaps %ymm0, 128(%rax) 3355; AVX2-NEXT: vmovdqa %ymm1, 192(%rax) 3356; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3357; AVX2-NEXT: vmovaps %ymm0, 256(%rax) 3358; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3359; AVX2-NEXT: vmovaps %ymm0, 320(%rax) 3360; AVX2-NEXT: vmovdqa %ymm12, (%rax) 3361; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3362; AVX2-NEXT: vmovaps %ymm0, 224(%rax) 3363; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3364; AVX2-NEXT: vmovaps %ymm0, 32(%rax) 3365; AVX2-NEXT: addq $616, %rsp # imm = 0x268 3366; AVX2-NEXT: vzeroupper 3367; AVX2-NEXT: retq 3368; 3369; AVX2-FP-LABEL: store_i16_stride6_vf32: 3370; AVX2-FP: # %bb.0: 3371; AVX2-FP-NEXT: subq $648, %rsp # imm = 0x288 3372; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 3373; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3374; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm8 3375; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] 3376; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 3377; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 3378; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3379; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm5 3380; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 3381; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 3382; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 3383; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm2 3384; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3385; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm6 3386; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 3387; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm3 3388; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill 3389; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm7 3390; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 3391; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 3392; AVX2-FP-NEXT: vpbroadcastq %xmm2, %ymm2 3393; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 3394; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 3395; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3396; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] 3397; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 3398; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 3399; AVX2-FP-NEXT: vmovdqa (%r9), %xmm3 3400; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3401; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] 3402; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 3403; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 3404; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 3405; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 3406; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3407; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 3408; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3409; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 3410; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3411; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 3412; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 3413; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3414; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 3415; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3416; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 3417; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm9 3418; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 3419; AVX2-FP-NEXT: vpbroadcastq %xmm2, %ymm2 3420; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] 3421; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,1,3,3,4,5,6,7] 3422; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3423; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 3424; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] 3425; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm14 3426; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm1 3427; AVX2-FP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3428; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 3429; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 3430; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3431; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm11 3432; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm13 3433; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] 3434; AVX2-FP-NEXT: vpshufb %ymm0, %ymm13, %ymm1 3435; AVX2-FP-NEXT: vpshufb %ymm0, %ymm11, %ymm2 3436; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3437; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 3438; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm12 3439; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm2 3440; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill 3441; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3442; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3443; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 3444; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 3445; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 3446; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 3447; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm2 3448; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3449; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 3450; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 3451; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 3452; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm15 3453; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] 3454; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1] 3455; AVX2-FP-NEXT: vpshufb %ymm4, %ymm15, %ymm2 3456; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3457; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 3458; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 3459; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3460; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 3461; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3462; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1 3463; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3464; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 3465; AVX2-FP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 3466; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] 3467; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm3 3468; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3469; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm1 3470; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3471; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3472; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3473; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 3474; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 3475; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 3476; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 3477; AVX2-FP-NEXT: vmovdqa (%r8), %ymm1 3478; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3479; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 3480; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 3481; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] 3482; AVX2-FP-NEXT: vmovdqa (%r9), %ymm1 3483; 
AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3484; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 3485; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 3486; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 3487; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3488; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] 3489; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] 3490; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 3491; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] 3492; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 3493; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 3494; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 3495; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm4 3496; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 3497; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] 3498; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] 3499; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm4 3500; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm4[0,1,0,1] 3501; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 3502; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm0 3503; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3504; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 3505; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3506; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 3507; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] 3508; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3509; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload 3510; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 3511; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3] 3512; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] 3513; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] 3514; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3515; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 3516; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 3517; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 3518; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 3519; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm1 3520; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 3521; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 3522; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3523; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] 3524; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] 3525; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 3526; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[12],ymm10[12],ymm12[13],ymm10[13],ymm12[14],ymm10[14],ymm12[15],ymm10[15] 3527; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] 3528; AVX2-FP-NEXT: 
vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 3529; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 3530; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 3531; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 3532; AVX2-FP-NEXT: vpshufb %ymm1, %ymm11, %ymm7 3533; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] 3534; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7] 3535; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] 3536; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 3537; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm0 3538; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3539; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 3540; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3541; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[12],ymm9[12],ymm7[13],ymm9[13],ymm7[14],ymm9[14],ymm7[15],ymm9[15] 3542; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] 3543; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 3544; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 3545; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15] 3546; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] 3547; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] 3548; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] 3549; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 3550; AVX2-FP-NEXT: vpshufb %ymm1, %ymm15, %ymm1 3551; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 3552; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 3553; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 3554; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] 3555; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 3556; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 3557; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 3558; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3559; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 3560; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 3561; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 3562; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] 3563; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 3564; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 3565; AVX2-FP-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero 3566; AVX2-FP-NEXT: vpbroadcastq %xmm1, %ymm1 3567; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 3568; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] 3569; AVX2-FP-NEXT: vpbroadcastq %xmm0, %ymm4 3570; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = 
[65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 3571; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm4 3572; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3573; AVX2-FP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 3574; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 3575; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 3576; AVX2-FP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload 3577; AVX2-FP-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 3578; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 3579; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] 3580; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] 3581; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] 3582; AVX2-FP-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 3583; AVX2-FP-NEXT: # xmm15 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 3584; AVX2-FP-NEXT: vpbroadcastq %xmm15, %ymm15 3585; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4],ymm15[5],ymm1[6,7] 3586; AVX2-FP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 3587; AVX2-FP-NEXT: # xmm15 = mem[0,0,2,1,4,5,6,7] 3588; AVX2-FP-NEXT: vpbroadcastq %xmm15, %ymm15 3589; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm15, %ymm1 3590; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3591; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[8],ymm13[8],ymm2[9],ymm13[9],ymm2[10],ymm13[10],ymm2[11],ymm13[11] 3592; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] 3593; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm15[2,2,2,3] 3594; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6] 3595; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] 3596; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7] 3597; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 3598; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] 3599; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] 3600; AVX2-FP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload 3601; AVX2-FP-NEXT: # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 3602; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] 3603; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10 3604; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[8],ymm9[8],ymm7[9],ymm9[9],ymm7[10],ymm9[10],ymm7[11],ymm9[11] 3605; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3606; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm14[0],ymm2[1],ymm14[1],ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[8],ymm14[8],ymm2[9],ymm14[9],ymm2[10],ymm14[10],ymm2[11],ymm14[11] 3607; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,3] 3608; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,2,2,5,4,6,6] 3609; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] 3610; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] 3611; AVX2-FP-NEXT: vpshuflw $212, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 3612; AVX2-FP-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 3613; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 3614; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] 3615; AVX2-FP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 3616; AVX2-FP-NEXT: # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 3617; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 3618; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm0 3619; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3620; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) 3621; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3622; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rax) 3623; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3624; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax) 3625; AVX2-FP-NEXT: vmovdqa %ymm10, 288(%rax) 3626; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3627; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rax) 3628; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3629; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rax) 3630; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3631; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax) 3632; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3633; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) 3634; AVX2-FP-NEXT: vmovdqa %ymm1, 192(%rax) 3635; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3636; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rax) 3637; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax) 3638; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3639; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) 3640; AVX2-FP-NEXT: addq $648, %rsp # imm = 0x288 3641; AVX2-FP-NEXT: vzeroupper 3642; AVX2-FP-NEXT: retq 3643; 3644; AVX2-FCP-LABEL: store_i16_stride6_vf32: 3645; AVX2-FCP: # %bb.0: 3646; AVX2-FCP-NEXT: subq $648, %rsp # imm = 0x288 3647; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 3648; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3649; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm14 3650; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] 3651; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 3652; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 3653; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3654; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 3655; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 3656; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 3657; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 3658; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm2 3659; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3660; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 3661; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 3662; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm3 3663; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3664; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm7 3665; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 3666; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 3667; AVX2-FCP-NEXT: vpbroadcastq %xmm2, %ymm2 3668; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 3669; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 3670; AVX2-FCP-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte 
Spill 3671; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] 3672; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 3673; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 3674; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm3 3675; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3676; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] 3677; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 3678; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 3679; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 3680; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 3681; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3682; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm1 3683; AVX2-FCP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3684; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 3685; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3686; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 3687; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 3688; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3689; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 3690; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3691; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 3692; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm8 3693; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 3694; AVX2-FCP-NEXT: vpbroadcastq %xmm1, %ymm1 3695; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 3696; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,1,3,3,4,5,6,7] 3697; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3698; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 3699; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] 3700; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm9 3701; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1 3702; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3703; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 3704; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 3705; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3706; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 3707; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3708; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm1 3709; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3710; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] 3711; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 3712; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 3713; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 3714; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm10 3715; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm4 3716; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3717; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3718; 
AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3719; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3720; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 3721; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 3722; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 3723; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 3724; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm2 3725; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3726; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 3727; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 3728; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 3729; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm3 3730; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3731; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] 3732; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] 3733; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm3 3734; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 3735; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 3736; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3737; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 3738; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3739; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1 3740; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3741; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 3742; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 3743; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] 3744; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm5 3745; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm1 3746; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3747; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3748; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3749; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] 3750; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 3751; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 3752; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 3753; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm1 3754; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3755; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 3756; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 3757; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] 3758; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm1 3759; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3760; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm2 3761; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[2,2,2,3] 3762; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 3763; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3764; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 3765; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,2,1,2,0,0,3,3] 3766; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 3767; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] 3768; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] 3769; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] 3770; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 3771; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm13 3772; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] 3773; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7] 3774; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] 3775; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm1 3776; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,1] 3777; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 3778; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm13, %ymm0, %ymm0 3779; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3780; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 3781; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3782; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] 3783; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 3784; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3785; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3786; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] 3787; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] 3788; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] 3789; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload 3790; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm2 3791; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 3792; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] 3793; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 3794; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm2 3795; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 3796; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 3797; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3798; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm4[4],ymm10[5],ymm4[5],ymm10[6],ymm4[6],ymm10[7],ymm4[7],ymm10[12],ymm4[12],ymm10[13],ymm4[13],ymm10[14],ymm4[14],ymm10[15],ymm4[15] 3799; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7] 3800; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 3801; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 3802; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 3803; AVX2-FCP-NEXT: # ymm3 = ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[12],mem[12],ymm3[13],mem[13],ymm3[14],mem[14],ymm3[15],mem[15] 3804; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] 3805; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] 3806; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 3807; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3808; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm6 3809; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] 3810; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] 3811; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] 3812; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3813; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm13 3814; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] 3815; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm13, %ymm0 3816; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm13 3817; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3818; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 3819; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] 3820; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 3821; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 3822; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3823; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] 3824; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] 3825; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] 3826; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 3827; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm2 3828; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 3829; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] 3830; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 3831; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm2 3832; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 3833; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm13 3834; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] 3835; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] 3836; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] 3837; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] 3838; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 3839; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 3840; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 3841; AVX2-FCP-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero 3842; AVX2-FCP-NEXT: vpbroadcastq %xmm1, %ymm1 3843; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 3844; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] 3845; AVX2-FCP-NEXT: vpbroadcastq %xmm0, %ymm2 3846; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 3847; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm2 3848; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3849; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 3850; AVX2-FCP-NEXT: # xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 3851; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 3852; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3853; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 3854; AVX2-FCP-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 3855; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 3856; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] 3857; AVX2-FCP-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3858; AVX2-FCP-NEXT: # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 3859; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3 3860; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] 3861; AVX2-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3862; AVX2-FCP-NEXT: # xmm3 = mem[0,0,2,1,4,5,6,7] 3863; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3 3864; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 3865; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 3866; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 3867; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] 3868; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3869; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload 3870; AVX2-FCP-NEXT: # ymm14 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] 3871; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [5,4,2,2,5,4,6,6] 3872; AVX2-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm3 3873; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] 3874; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6],ymm3[7] 3875; AVX2-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload 3876; AVX2-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 3877; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] 3878; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7] 3879; AVX2-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 3880; AVX2-FCP-NEXT: # ymm11 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 3881; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] 3882; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm11, %ymm3 3883; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] 3884; AVX2-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm7 3885; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] 3886; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] 3887; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] 3888; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 3889; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] 3890; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] 3891; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 3892; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = 
ymm4[2,2,2,2] 3893; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm0 3894; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3895; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) 3896; AVX2-FCP-NEXT: vmovdqa %ymm13, 160(%rax) 3897; AVX2-FCP-NEXT: vmovdqa %ymm3, 288(%rax) 3898; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3899; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rax) 3900; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3901; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax) 3902; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3903; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax) 3904; AVX2-FCP-NEXT: vmovdqa %ymm1, 192(%rax) 3905; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3906; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rax) 3907; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3908; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rax) 3909; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax) 3910; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3911; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax) 3912; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3913; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) 3914; AVX2-FCP-NEXT: addq $648, %rsp # imm = 0x288 3915; AVX2-FCP-NEXT: vzeroupper 3916; AVX2-FCP-NEXT: retq 3917; 3918; AVX512-LABEL: store_i16_stride6_vf32: 3919; AVX512: # %bb.0: 3920; AVX512-NEXT: vmovdqa 32(%rcx), %ymm4 3921; AVX512-NEXT: vpsrldq {{.*#+}} ymm0 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3922; AVX512-NEXT: vmovdqa 32(%rdx), %ymm8 3923; AVX512-NEXT: vpsrldq {{.*#+}} ymm1 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3924; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 3925; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 3926; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] 3927; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] 3928; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 3929; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 3930; AVX512-NEXT: vmovdqa 32(%rsi), %ymm9 3931; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,1,2,3,6,5,6,7] 3932; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 3933; AVX512-NEXT: vmovdqa 32(%rdi), %ymm11 3934; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,1,2,3,6,5,6,7] 3935; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 3936; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 3937; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 3938; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15] 3939; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] 3940; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm2 3941; AVX512-NEXT: movw $18724, %ax # imm = 0x4924 3942; AVX512-NEXT: kmovw %eax, %k1 3943; AVX512-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 3944; AVX512-NEXT: vextracti64x4 $1, %zmm2, %ymm0 3945; AVX512-NEXT: 
vmovdqa 32(%r8), %ymm12 3946; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 3947; AVX512-NEXT: vpshufb %ymm1, %ymm12, %ymm3 3948; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 3949; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] 3950; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 3951; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 3952; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] 3953; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm16 3954; AVX512-NEXT: vmovdqa (%rcx), %ymm2 3955; AVX512-NEXT: vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3956; AVX512-NEXT: vmovdqa (%rdx), %ymm3 3957; AVX512-NEXT: vpsrldq {{.*#+}} ymm5 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 3958; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11] 3959; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 3960; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] 3961; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] 3962; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] 3963; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 3964; AVX512-NEXT: vmovdqa (%rsi), %ymm0 3965; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[2,1,2,3,6,5,6,7] 3966; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 3967; AVX512-NEXT: vmovdqa (%rdi), %ymm10 3968; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm10[2,1,2,3,6,5,6,7] 3969; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 3970; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] 3971; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] 3972; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[12],ymm0[12],ymm10[13],ymm0[13],ymm10[14],ymm0[14],ymm10[15],ymm0[15] 3973; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,3,3,3] 3974; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 3975; AVX512-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} 3976; AVX512-NEXT: vextracti64x4 $1, %zmm6, %ymm5 3977; AVX512-NEXT: vmovdqa (%r8), %ymm14 3978; AVX512-NEXT: vpshufb %ymm1, %ymm14, %ymm1 3979; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 3980; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] 3981; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 3982; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] 3983; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] 3984; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm17 3985; AVX512-NEXT: vmovdqa (%rcx), %xmm5 3986; AVX512-NEXT: vmovdqa 32(%rcx), %xmm13 3987; AVX512-NEXT: vmovdqa (%rdx), %xmm6 3988; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 3989; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = 
ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 3990; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] 3991; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm2 3992; AVX512-NEXT: vmovdqa (%rsi), %xmm3 3993; AVX512-NEXT: vmovdqa (%rdi), %xmm7 3994; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] 3995; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] 3996; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11] 3997; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] 3998; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 3999; AVX512-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 4000; AVX512-NEXT: vextracti64x4 $1, %zmm2, %ymm0 4001; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 4002; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 4003; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 4004; AVX512-NEXT: vmovdqa (%r8), %xmm10 4005; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 4006; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm0 4007; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 4008; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] 4009; AVX512-NEXT: vmovdqa 32(%rdx), %xmm0 4010; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2 4011; AVX512-NEXT: vmovdqa 32(%rsi), %xmm14 4012; AVX512-NEXT: vmovdqa 32(%rdi), %xmm15 4013; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] 4014; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] 4015; AVX512-NEXT: vpermt2d %zmm8, %zmm18, %zmm4 4016; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] 4017; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] 4018; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] 4019; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] 4020; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 4021; AVX512-NEXT: vmovdqa32 %zmm8, %zmm4 {%k1} 4022; AVX512-NEXT: vextracti64x4 $1, %zmm4, %ymm8 4023; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 4024; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] 4025; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] 4026; AVX512-NEXT: vmovdqa 32(%r8), %xmm12 4027; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm1 4028; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 4029; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7] 4030; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm4 4031; AVX512-NEXT: vmovdqa 32(%r9), %ymm11 4032; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 4033; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 4034; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,2,2,3] 4035; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7] 4036; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 4037; 
AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,1,2,3] 4038; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] 4039; AVX512-NEXT: vpsrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4040; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4041; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] 4042; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] 4043; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 4044; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,1,2,1] 4045; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] 4046; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[0,1,2,1] 4047; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5] 4048; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] 4049; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] 4050; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] 4051; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm13[0,1,2,3],zmm0[0,1,0,1] 4052; AVX512-NEXT: movw $9362, %ax # imm = 0x2492 4053; AVX512-NEXT: kmovw %eax, %k1 4054; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} 4055; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 4056; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,1,3,3,4,5,6,7] 4057; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] 4058; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm1[1,2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7] 4059; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero 4060; AVX512-NEXT: vpbroadcastq %xmm12, %ymm12 4061; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] 4062; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 4063; AVX512-NEXT: vmovdqa (%r9), %ymm0 4064; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 4065; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 4066; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,2,2,3] 4067; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] 4068; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 4069; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,1,2,3] 4070; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] 4071; AVX512-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4072; AVX512-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4073; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] 4074; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm1 4075; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,2,1] 4076; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] 4077; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,1,2,1] 4078; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] 4079; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 4080; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] 4081; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 4082; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm5[0,1,0,1] 4083; AVX512-NEXT: vmovdqa32 %zmm1, %zmm3 
{%k1} 4084; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm1 4085; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[2,1,3,3,4,5,6,7] 4086; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] 4087; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] 4088; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero 4089; AVX512-NEXT: vpbroadcastq %xmm5, %ymm5 4090; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] 4091; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 4092; AVX512-NEXT: vmovdqa (%r9), %xmm3 4093; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] 4094; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7] 4095; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 4096; AVX512-NEXT: vmovdqa 32(%r9), %xmm6 4097; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 4098; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 4099; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] 4100; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,1,4,5,6,7] 4101; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 4102; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 4103; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] 4104; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[0,0,2,1,4,5,6,7] 4105; AVX512-NEXT: vpbroadcastq %xmm11, %ymm11 4106; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] 4107; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] 4108; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] 4109; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,0,2,1,4,5,6,7] 4110; AVX512-NEXT: vpbroadcastq %xmm15, %ymm15 4111; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] 4112; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 4113; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 4114; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 4115; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 4116; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 4117; AVX512-NEXT: vpternlogd {{.*#+}} zmm8 = zmm8 ^ (zmm9 & (zmm8 ^ zmm16)) 4118; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 4119; AVX512-NEXT: vpternlogd {{.*#+}} zmm13 = zmm13 ^ (zmm9 & (zmm13 ^ zmm17)) 4120; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 4121; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 4122; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (zmm5 & (zmm0 ^ zmm2)) 4123; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm2 4124; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (zmm5 & (zmm2 ^ zmm4)) 4125; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm4 4126; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 4127; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm5 & (zmm4 ^ zmm12)) 4128; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 4129; AVX512-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (zmm5 & (zmm3 ^ zmm1)) 4130; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) 4131; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rax) 4132; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rax) 4133; 
AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) 4134; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rax) 4135; AVX512-NEXT: vmovdqa64 %zmm8, 320(%rax) 4136; AVX512-NEXT: vzeroupper 4137; AVX512-NEXT: retq 4138; 4139; AVX512-FCP-LABEL: store_i16_stride6_vf32: 4140; AVX512-FCP: # %bb.0: 4141; AVX512-FCP-NEXT: pushq %rax 4142; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 4143; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] 4144; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm1 4145; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 4146; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm3 4147; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] 4148; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] 4149; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm23 4150; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 4151; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,1,2,3,11,11,11,11] 4152; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 4153; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm11 4154; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 4155; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] 4156; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [5,6,5,6,5,6,7,7] 4157; AVX512-FCP-NEXT: vpermd %ymm3, %ymm10, %ymm3 4158; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 4159; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm5 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 4160; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] 4161; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] 4162; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 4163; AVX512-FCP-NEXT: movw $18724, %ax # imm = 0x4924 4164; AVX512-FCP-NEXT: kmovw %eax, %k1 4165; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} 4166; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2 4167; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 4168; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [12,1,2,13,4,5,14,7] 4169; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm5 4170; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm12, %ymm5 4171; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [8,21,10,11,20,13,14,23] 4172; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 4173; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm3 4174; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 4175; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm17, %zmm1 4176; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm16 4177; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm3 4178; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] 4179; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm1 4180; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] 4181; 
AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] 4182; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm2 4183; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 4184; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,2,0,3,10,0,10,11] 4185; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2 4186; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4187; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3 4188; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm1 4189; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 4190; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 4191; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] 4192; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] 4193; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 4194; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 4195; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm1 4196; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm6 4197; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[12],ymm1[12],ymm6[13],ymm1[13],ymm6[14],ymm1[14],ymm6[15],ymm1[15] 4198; AVX512-FCP-NEXT: vpermd %ymm2, %ymm10, %ymm2 4199; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm10 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 4200; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 4201; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm10[0],ymm4[1],ymm10[1],ymm4[2],ymm10[2],ymm4[3],ymm10[3],ymm4[8],ymm10[8],ymm4[9],ymm10[9],ymm4[10],ymm10[10],ymm4[11],ymm10[11] 4202; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] 4203; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 4204; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} 4205; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm10 4206; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 4207; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm0, %ymm12 4208; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm2 4209; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm0 4210; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm17 4211; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm12 4212; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm12, %ymm0 4213; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm2 4214; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 4215; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4216; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm14 4217; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm15 4218; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11] 4219; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] 4220; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm18 4221; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 4222; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm6 4223; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] 4224; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] 
4225; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm20 4226; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm13 4227; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] 4228; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm4 4229; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 4230; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm5 4231; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 4232; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,2,1,8,9,8,9] 4233; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] 4234; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm11 4235; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] 4236; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] 4237; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] 4238; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm8 4239; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4240; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4241; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] 4242; AVX512-FCP-NEXT: vpbroadcastq %xmm1, %ymm1 4243; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 4244; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492 4245; AVX512-FCP-NEXT: kmovw %eax, %k2 4246; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm11 {%k2} 4247; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm6 4248; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [16,9,10,17,12,13,18,15] 4249; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm1 4250; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[2,1,3,3,4,5,6,7] 4251; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm11 4252; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 4253; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,1,8,3,4,9,6,7] 4254; AVX512-FCP-NEXT: vpermt2d %ymm9, %ymm21, %ymm6 4255; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm19 4256; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm9 4257; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm11 4258; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm6 4259; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2 4260; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 4261; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] 4262; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm5 4263; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] 4264; AVX512-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2 4265; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm4 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4266; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm6 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4267; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] 4268; AVX512-FCP-NEXT: vpbroadcastq %xmm4, %ymm4 4269; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 4270; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm5 {%k2} 4271; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 4272; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 4273; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm21 4274; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] 
4275; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm5 4276; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,8,8,0,9] 4277; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm6 4278; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] 4279; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm15 4280; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[0,0,2,1,4,5,6,7] 4281; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm14 4282; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm21, %zmm21 4283; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm5 4284; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8 4285; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm5[0,0,2,1,4,5,6,7] 4286; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 4287; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm4 4288; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] 4289; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] 4290; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [1,1,1,1,10,10,10,11] 4291; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm8 4292; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14] 4293; AVX512-FCP-NEXT: vpermd %zmm18, %zmm4, %zmm11 4294; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1} 4295; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 4296; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 4297; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,9,2,3,8,5,6,11] 4298; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm22 4299; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm18, %ymm22 4300; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,9,20,11,12,21,14,15] 4301; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 4302; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm2, %zmm11 4303; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 4304; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] 4305; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm8 4306; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,0,10,10,0] 4307; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm5, %zmm8 4308; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm10 4309; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm7 4310; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] 4311; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] 4312; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm9, %zmm0 4313; AVX512-FCP-NEXT: vpermd %zmm20, %zmm4, %zmm4 4314; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} 4315; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm0 4316; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm4, %ymm18 4317; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 4318; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 4319; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 4320; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm0 4321; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm1 4322; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 4323; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 4324; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm18, %zmm1 4325; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 4326; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1)) 4327; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4328; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%rax) 4329; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm22, %zmm0 4330; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm8 = zmm8 ^ (zmm2 & (zmm8 ^ zmm0)) 4331; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) 4332; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 4333; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm15 = zmm15 ^ (zmm0 & (zmm15 ^ zmm21)) 4334; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rax) 4335; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm19)) 4336; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 192(%rax) 4337; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 4338; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 4339; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm17)) 4340; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) 4341; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 4342; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm16)) 4343; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) 4344; AVX512-FCP-NEXT: popq %rax 4345; AVX512-FCP-NEXT: vzeroupper 4346; AVX512-FCP-NEXT: retq 4347; 4348; AVX512DQ-LABEL: store_i16_stride6_vf32: 4349; AVX512DQ: # %bb.0: 4350; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6 4351; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm2 4352; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm9 4353; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm3 4354; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 4355; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4356; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm25 4357; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4358; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm26 4359; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 4360; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] 4361; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 4362; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm8 4363; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm4 4364; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1] 4365; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] 4366; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10 4367; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm5 4368; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] 4369; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] 4370; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 4371; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 4372; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm27 4373; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm28 4374; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 4375; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = 
zmm3[0,1,2,3],zmm1[0,1,0,1] 4376; AVX512DQ-NEXT: movw $9362, %ax # imm = 0x2492 4377; AVX512DQ-NEXT: kmovw %eax, %k1 4378; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 4379; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 4380; AVX512DQ-NEXT: vmovdqa (%r8), %xmm11 4381; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm4 4382; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] 4383; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 4384; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] 4385; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 4386; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm29 4387; AVX512DQ-NEXT: vpbroadcastq %xmm3, %ymm3 4388; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] 4389; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm16 4390; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 4391; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4392; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm24 4393; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm3 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4394; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 4395; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 4396; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,2,1] 4397; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] 4398; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,2,1] 4399; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] 4400; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 4401; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] 4402; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm31 4403; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 4404; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,0,1] 4405; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 4406; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 4407; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,3,3,4,5,6,7] 4408; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 4409; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] 4410; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero 4411; AVX512DQ-NEXT: vpbroadcastq %xmm2, %ymm2 4412; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 4413; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm17 4414; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm13 4415; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[2,1,2,3,6,5,6,7] 4416; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 4417; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm14 4418; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,1,2,3,6,5,6,7] 4419; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 4420; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 4421; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 4422; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15] 4423; 
AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] 4424; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm15 4425; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm2 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 4426; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm7 4427; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 4428; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 4429; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4430; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,2,2,2] 4431; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm15[4],ymm7[5],ymm15[5],ymm7[6],ymm15[6],ymm7[7],ymm15[7],ymm7[12],ymm15[12],ymm7[13],ymm15[13],ymm7[14],ymm15[14],ymm7[15],ymm15[15] 4432; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] 4433; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 4434; AVX512DQ-NEXT: movw $18724, %ax # imm = 0x4924 4435; AVX512DQ-NEXT: kmovw %eax, %k1 4436; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} 4437; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm6 4438; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 4439; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm1 4440; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 4441; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 4442; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 4443; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 4444; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 4445; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] 4446; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 4447; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 4448; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,1,2,3,6,5,6,7] 4449; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 4450; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 4451; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] 4452; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 4453; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 4454; AVX512DQ-NEXT: vpermq {{.*#+}} ymm18 = ymm0[2,1,2,3] 4455; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] 4456; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm0[3,3,3,3] 4457; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2 4458; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 4459; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm0 4460; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm8 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 4461; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] 4462; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 
4463; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 4464; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7] 4465; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] 4466; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm18, %zmm12 4467; AVX512DQ-NEXT: vinserti32x8 $1, %ymm8, %zmm1, %zmm12 {%k1} 4468; AVX512DQ-NEXT: vmovdqa (%r8), %ymm1 4469; AVX512DQ-NEXT: vpshufb %ymm5, %ymm1, %ymm5 4470; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] 4471; AVX512DQ-NEXT: vextracti64x4 $1, %zmm12, %ymm8 4472; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6],ymm5[7] 4473; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 4474; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] 4475; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7] 4476; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm18 4477; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm12 4478; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,0,2,1,4,5,6,7] 4479; AVX512DQ-NEXT: vpbroadcastq %xmm5, %ymm19 4480; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,2,2,3,4,5,6,7] 4481; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] 4482; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm5[0,0,2,1] 4483; AVX512DQ-NEXT: vmovdqa (%r9), %xmm5 4484; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,0,2,1,4,5,6,7] 4485; AVX512DQ-NEXT: vpbroadcastq %xmm8, %ymm21 4486; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] 4487; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] 4488; AVX512DQ-NEXT: vpermq {{.*#+}} ymm23 = ymm8[0,0,2,1] 4489; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm8 4490; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 4491; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 4492; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] 4493; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm24, %zmm0 4494; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm2 4495; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] 4496; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] 4497; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] 4498; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 4499; AVX512DQ-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} 4500; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 4501; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 4502; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 4503; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 4504; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 4505; AVX512DQ-NEXT: vpshufb %xmm3, %xmm11, %xmm2 4506; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 4507; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] 4508; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 4509; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm1 4510; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 4511; 
AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 4512; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] 4513; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[2,3,2,3,6,7,6,7] 4514; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 4515; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] 4516; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm8 4517; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm9 4518; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 4519; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[8],ymm15[8],ymm7[9],ymm15[9],ymm7[10],ymm15[10],ymm7[11],ymm15[11] 4520; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm24, %zmm7 4521; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm8 4522; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm9 4523; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] 4524; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] 4525; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11] 4526; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] 4527; AVX512DQ-NEXT: vinserti32x8 $1, %ymm9, %zmm8, %zmm7 {%k1} 4528; AVX512DQ-NEXT: vextracti64x4 $1, %zmm7, %ymm8 4529; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 4530; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] 4531; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] 4532; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm8 4533; AVX512DQ-NEXT: vpshufb %xmm3, %xmm8, %xmm3 4534; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] 4535; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6],ymm3[7] 4536; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 4537; AVX512DQ-NEXT: vmovdqa (%r9), %ymm6 4538; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 4539; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 4540; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] 4541; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[2,3,2,3,6,7,6,7] 4542; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 4543; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] 4544; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] 4545; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7] 4546; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 4547; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 4548; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] 4549; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,3,2,3] 4550; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7] 4551; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 4552; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 4553; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 4554; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 4555; AVX512DQ-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm10 4556; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 4557; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm10 = zmm10 ^ (zmm11 & (zmm10 ^ zmm16)) 
4558; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm21, %zmm12 4559; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm12 = zmm12 ^ (zmm11 & (zmm12 ^ zmm17)) 4560; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 4561; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 4562; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (zmm4 & (zmm0 ^ zmm22)) 4563; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 4564; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm4 & (zmm7 ^ zmm18)) 4565; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4 4566; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 4567; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm5 & (zmm4 ^ zmm2)) 4568; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 4569; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm3)) 4570; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%rax) 4571; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rax) 4572; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rax) 4573; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%rax) 4574; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%rax) 4575; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rax) 4576; AVX512DQ-NEXT: vzeroupper 4577; AVX512DQ-NEXT: retq 4578; 4579; AVX512DQ-FCP-LABEL: store_i16_stride6_vf32: 4580; AVX512DQ-FCP: # %bb.0: 4581; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1 4582; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 4583; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 4584; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31 4585; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 4586; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 4587; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4588; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 4589; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4590; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 4591; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [1,1,1,1,10,10,10,11] 4592; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 4593; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm2 4594; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm3 4595; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 4596; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 4597; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 4598; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3 4599; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4600; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm5 4601; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 4602; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4603; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm6 4604; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 4605; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 4606; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14] 4607; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm20, %zmm19 4608; AVX512DQ-FCP-NEXT: movw $18724, %ax # imm = 0x4924 4609; 
AVX512DQ-FCP-NEXT: kmovw %eax, %k1 4610; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm19 {%k1} 4611; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm0 4612; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4613; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 4614; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm1 4615; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,9,2,3,8,5,6,11] 4616; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm23 4617; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm17, %ymm23 4618; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [8,9,20,11,12,21,14,15] 4619; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm9 4620; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm4 4621; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11] 4622; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 4623; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm30 4624; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm29 4625; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm5 4626; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm7 4627; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 4628; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm21, %zmm19 4629; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm8 4630; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,1,0,10,10,0] 4631; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 4632; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2 4633; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] 4634; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm15 4635; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm28 4636; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm15 4637; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm11 4638; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 4639; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 4640; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 4641; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] 4642; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] 4643; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm13 4644; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm20, %zmm20 4645; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm13, %zmm20 {%k1} 4646; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm12 4647; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm3 4648; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm20, %ymm17 4649; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm3 4650; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 4651; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm20 4652; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm5 4653; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm13 4654; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm24 4655; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm5 4656; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 4657; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 4658; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] 4659; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm0 4660; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm14 4661; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11] 4662; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] 4663; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,1,2,3,11,11,11,11] 4664; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 4665; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [5,6,5,6,5,6,7,7] 4666; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm9[4],ymm4[5],ymm9[5],ymm4[6],ymm9[6],ymm4[7],ymm9[7],ymm4[12],ymm9[12],ymm4[13],ymm9[13],ymm4[14],ymm9[14],ymm4[15],ymm9[15] 4667; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm21, %ymm1 4668; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 4669; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 4670; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11] 4671; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] 4672; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm4, %zmm0 {%k1} 4673; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [12,1,2,13,4,5,14,7] 4674; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm9 4675; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 4676; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm4, %ymm9 4677; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 4678; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 4679; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm16 4680; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [8,21,10,11,20,13,14,23] 4681; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm22, %zmm0 4682; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm18 4683; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm9 4684; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm0 4685; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 4686; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm3 4687; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] 4688; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15] 4689; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 4690; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] 4691; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm3 4692; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] 4693; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] 4694; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm14 4695; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,2,0,3,10,0,10,11] 4696; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm14 4697; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm1 4698; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm9 4699; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15] 4700; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm21, %ymm3 4701; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 4702; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 4703; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[8],ymm9[8],ymm1[9],ymm9[9],ymm1[10],ymm9[10],ymm1[11],ymm9[11] 4704; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 4705; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm3, %zmm1, %zmm0 {%k1} 4706; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 4707; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 4708; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm1 4709; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm1 4710; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm22, %zmm0 4711; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 4712; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm2 4713; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1 4714; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 4715; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] 4716; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm3 4717; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm4 4718; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 4719; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] 4720; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,2,1,8,9,8,9] 4721; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 4722; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] 4723; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] 4724; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm7 4725; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm8 4726; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 4727; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm6 4728; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4729; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4730; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 4731; AVX512DQ-FCP-NEXT: vpbroadcastq %xmm7, %ymm7 4732; AVX512DQ-FCP-NEXT: movw $9362, %ax # imm = 0x2492 4733; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 4734; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm7, %zmm6, %zmm3 {%k1} 4735; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero 4736; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm7 4737; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [16,9,10,17,12,13,18,15] 4738; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[2,1,3,3,4,5,6,7] 4739; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm3 4740; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,8,3,4,9,6,7] 4741; AVX512DQ-FCP-NEXT: vpermt2d %ymm6, %ymm9, %ymm7 4742; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm3 4743; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 4744; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm10, 
%xmm6 4745; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4746; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2 4747; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 4748; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] 4749; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] 4750; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 4751; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm2 4752; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm5 4753; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] 4754; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,8,8,0,9] 4755; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 4756; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 4757; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 4758; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] 4759; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm4 4760; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4761; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 4762; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] 4763; AVX512DQ-FCP-NEXT: vpbroadcastq %xmm5, %ymm5 4764; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm6 {%k1} 4765; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4766; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero 4767; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm6, %ymm9 4768; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,3,3,4,5,6,7] 4769; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 4770; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm4 4771; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm6 4772; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm5 4773; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,2,1,4,5,6,7] 4774; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm6 4775; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 4776; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (zmm5 & (zmm6 ^ zmm4)) 4777; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4778; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax) 4779; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (zmm5 & (zmm2 ^ zmm3)) 4780; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) 4781; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 4782; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ zmm0)) 4783; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) 4784; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm14 = zmm14 ^ (zmm2 & (zmm14 ^ zmm18)) 4785; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 320(%rax) 4786; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm0 4787; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 4788; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm0)) 4789; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) 4790; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm23, %zmm0 4791; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm15 = zmm15 ^ (zmm1 & (zmm15 ^ zmm0)) 4792; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 64(%rax) 4793; AVX512DQ-FCP-NEXT: vzeroupper 4794; AVX512DQ-FCP-NEXT: retq 4795; 4796; AVX512BW-LABEL: store_i16_stride6_vf32: 4797; AVX512BW: # %bb.0: 4798; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 4799; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 4800; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 4801; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 4802; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5 4803; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 4804; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 4805; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] 4806; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 4807; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm6 4808; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] 4809; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4810; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4811; AVX512BW-NEXT: movw $9362, %cx # imm = 0x2492 4812; AVX512BW-NEXT: kmovd %ecx, %k2 4813; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} 4814; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] 4815; AVX512BW-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 4816; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] 4817; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 4818; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] 4819; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4820; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4821; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] 4822; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 4823; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 4824; AVX512BW-NEXT: movw $18724, %cx # imm = 0x4924 4825; AVX512BW-NEXT: kmovd %ecx, %k1 4826; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} 4827; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] 4828; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 4829; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] 4830; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 4831; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] 4832; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4833; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 4834; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] 4835; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 4836; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 4837; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} 4838; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = 
[42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] 4839; AVX512BW-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 4840; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] 4841; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 4842; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] 4843; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4844; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 4845; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] 4846; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 4847; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 4848; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} 4849; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] 4850; AVX512BW-NEXT: vpermi2w %zmm1, %zmm10, %zmm7 4851; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] 4852; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 4853; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] 4854; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4855; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4856; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] 4857; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 4858; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 4859; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} 4860; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] 4861; AVX512BW-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 4862; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] 4863; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 4864; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] 4865; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4866; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 4867; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] 4868; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] 4869; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 4870; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} 4871; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] 4872; AVX512BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 4873; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] 4874; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 4875; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%rax) 4876; AVX512BW-NEXT: vmovdqa64 %zmm11, 256(%rax) 4877; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) 4878; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax) 4879; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) 4880; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) 4881; AVX512BW-NEXT: vzeroupper 4882; AVX512BW-NEXT: retq 4883; 4884; AVX512BW-FCP-LABEL: store_i16_stride6_vf32: 4885; AVX512BW-FCP: # %bb.0: 4886; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4887; AVX512BW-FCP-NEXT: 
vmovdqa64 (%rdi), %zmm2 4888; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 4889; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 4890; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm5 4891; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 4892; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 4893; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] 4894; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 4895; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm6 4896; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] 4897; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4898; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4899; AVX512BW-FCP-NEXT: movw $9362, %cx # imm = 0x2492 4900; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 4901; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} 4902; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] 4903; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 4904; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] 4905; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 4906; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] 4907; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4908; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4909; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] 4910; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 4911; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 4912; AVX512BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924 4913; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 4914; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} 4915; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] 4916; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 4917; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] 4918; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 4919; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] 4920; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4921; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 4922; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] 4923; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 4924; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 4925; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} 4926; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] 4927; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 4928; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] 4929; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 4930; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] 4931; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4932; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 4933; AVX512BW-FCP-NEXT: vbroadcasti64x4 
{{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] 4934; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 4935; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 4936; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} 4937; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] 4938; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm10, %zmm7 4939; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] 4940; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 4941; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] 4942; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4943; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4944; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] 4945; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 4946; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 4947; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} 4948; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] 4949; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 4950; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] 4951; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 4952; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] 4953; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4954; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 4955; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] 4956; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] 4957; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 4958; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} 4959; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] 4960; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 4961; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] 4962; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 4963; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) 4964; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) 4965; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) 4966; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax) 4967; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) 4968; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) 4969; AVX512BW-FCP-NEXT: vzeroupper 4970; AVX512BW-FCP-NEXT: retq 4971; 4972; AVX512DQ-BW-LABEL: store_i16_stride6_vf32: 4973; AVX512DQ-BW: # %bb.0: 4974; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 4975; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 4976; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm3 4977; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 4978; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm5 4979; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1 4980; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm0 4981; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] 4982; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 4983; 
AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm6 4984; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] 4985; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4986; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4987; AVX512DQ-BW-NEXT: movw $9362, %cx # imm = 0x2492 4988; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 4989; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} 4990; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] 4991; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 4992; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] 4993; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 4994; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] 4995; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4996; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4997; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] 4998; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 4999; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 5000; AVX512DQ-BW-NEXT: movw $18724, %cx # imm = 0x4924 5001; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 5002; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} 5003; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] 5004; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 5005; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] 5006; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 5007; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] 5008; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 5009; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 5010; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] 5011; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 5012; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 5013; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} 5014; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] 5015; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 5016; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] 5017; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 5018; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] 5019; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 5020; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 5021; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] 5022; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 5023; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 5024; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} 5025; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] 5026; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm10, %zmm7 5027; AVX512DQ-BW-NEXT: vpmovsxbw 
{{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] 5028; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 5029; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] 5030; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 5031; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 5032; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] 5033; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 5034; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 5035; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} 5036; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] 5037; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 5038; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] 5039; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 5040; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] 5041; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 5042; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 5043; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] 5044; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] 5045; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 5046; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} 5047; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] 5048; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 5049; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] 5050; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 5051; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 320(%rax) 5052; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 256(%rax) 5053; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%rax) 5054; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 128(%rax) 5055; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%rax) 5056; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rax) 5057; AVX512DQ-BW-NEXT: vzeroupper 5058; AVX512DQ-BW-NEXT: retq 5059; 5060; AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf32: 5061; AVX512DQ-BW-FCP: # %bb.0: 5062; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 5063; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 5064; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 5065; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 5066; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm5 5067; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 5068; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 5069; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] 5070; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 5071; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm6 5072; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] 5073; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 5074; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 5075; AVX512DQ-BW-FCP-NEXT: movw $9362, %cx # imm = 0x2492 5076; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 5077; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} 5078; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] 5079; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 5080; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] 5081; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 5082; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] 5083; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 5084; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 5085; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] 5086; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 5087; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 5088; AVX512DQ-BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924 5089; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 5090; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} 5091; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] 5092; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 5093; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] 5094; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 5095; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] 5096; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 5097; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 5098; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] 5099; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 5100; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 5101; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} 5102; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] 5103; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 5104; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] 5105; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 5106; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] 5107; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 5108; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 5109; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] 5110; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 5111; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 5112; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} 5113; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] 5114; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm10, %zmm7 5115; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] 5116; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 5117; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] 5118; 
AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 5119; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 5120; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] 5121; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 5122; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 5123; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} 5124; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] 5125; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 5126; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] 5127; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 5128; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] 5129; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 5130; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 5131; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] 5132; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] 5133; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 5134; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} 5135; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] 5136; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 5137; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] 5138; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 5139; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) 5140; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) 5141; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) 5142; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax) 5143; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) 5144; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) 5145; AVX512DQ-BW-FCP-NEXT: vzeroupper 5146; AVX512DQ-BW-FCP-NEXT: retq 5147 %in.vec0 = load <32 x i16>, ptr %in.vecptr0, align 64 5148 %in.vec1 = load <32 x i16>, ptr %in.vecptr1, align 64 5149 %in.vec2 = load <32 x i16>, ptr %in.vecptr2, align 64 5150 %in.vec3 = load <32 x i16>, ptr %in.vecptr3, align 64 5151 %in.vec4 = load <32 x i16>, ptr %in.vecptr4, align 64 5152 %in.vec5 = load <32 x i16>, ptr %in.vecptr5, align 64 5153 %1 = shufflevector <32 x i16> %in.vec0, <32 x i16> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 5154 %2 = shufflevector <32 x i16> %in.vec2, <32 x i16> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, 
i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 5155 %3 = shufflevector <32 x i16> %in.vec4, <32 x i16> %in.vec5, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 5156 %4 = shufflevector <64 x i16> %1, <64 x i16> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 5157 %5 = shufflevector <64 x i16> %3, <64 x i16> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 5158 %6 = shufflevector <128 x i16> %4, <128 x i16> %5, <192 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, 
i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191> 5159 %interleaved.vec = shufflevector <192 x i16> %6, <192 x i16> poison, <192 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 160, i32 1, i32 33, i32 65, i32 97, i32 129, i32 161, i32 2, i32 34, i32 66, i32 98, i32 130, i32 162, i32 3, i32 35, i32 67, i32 99, i32 131, i32 163, i32 4, i32 36, i32 68, i32 100, i32 132, i32 164, i32 5, i32 37, i32 69, i32 101, i32 133, i32 165, i32 6, i32 38, i32 70, i32 102, i32 134, i32 166, i32 7, i32 39, i32 71, i32 103, i32 135, i32 167, i32 8, i32 40, i32 72, i32 104, i32 136, i32 168, i32 9, i32 41, i32 73, i32 105, i32 137, i32 169, i32 10, i32 42, i32 74, i32 106, i32 138, i32 170, i32 11, i32 43, i32 75, i32 107, i32 139, i32 171, i32 12, i32 44, i32 76, i32 108, i32 140, i32 172, i32 13, i32 45, i32 77, i32 109, i32 141, i32 173, i32 14, i32 46, i32 78, i32 110, i32 142, i32 174, i32 15, i32 47, i32 79, i32 111, i32 143, i32 175, i32 16, i32 48, i32 80, i32 112, i32 144, i32 176, i32 17, i32 49, i32 81, i32 113, i32 145, i32 177, i32 18, i32 50, i32 82, i32 114, i32 146, i32 178, i32 19, i32 51, i32 83, i32 115, i32 147, i32 179, i32 20, i32 52, i32 84, i32 116, i32 148, i32 180, i32 21, i32 53, i32 85, i32 117, i32 149, i32 181, i32 22, i32 54, i32 86, i32 118, i32 150, i32 182, i32 23, i32 55, i32 87, i32 119, i32 151, i32 183, i32 24, i32 56, i32 88, i32 120, i32 152, i32 184, i32 25, i32 57, i32 89, i32 121, i32 153, i32 185, i32 26, i32 58, i32 90, i32 122, i32 154, i32 186, i32 27, i32 59, i32 91, i32 123, i32 155, i32 187, i32 28, i32 60, i32 92, i32 124, i32 156, i32 188, i32 29, i32 61, i32 93, i32 125, i32 157, i32 189, i32 30, i32 62, i32 94, i32 126, i32 158, i32 190, i32 31, i32 63, i32 95, i32 127, i32 159, i32 191> 5160 store <192 x i16> %interleaved.vec, ptr %out.vec, align 64 5161 ret void 5162} 5163 5164define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { 5165; SSE-LABEL: store_i16_stride6_vf64: 5166; SSE: # %bb.0: 5167; SSE-NEXT: subq $808, %rsp # imm = 0x328 5168; SSE-NEXT: movdqa (%rdi), %xmm10 5169; SSE-NEXT: movdqa 16(%rdi), %xmm11 5170; SSE-NEXT: movdqa (%rsi), %xmm4 5171; SSE-NEXT: movdqa 16(%rsi), %xmm1 5172; SSE-NEXT: movdqa (%rdx), %xmm12 5173; SSE-NEXT: 
movdqa 16(%rdx), %xmm2 5174; SSE-NEXT: movdqa (%rcx), %xmm6 5175; SSE-NEXT: movdqa 16(%rcx), %xmm3 5176; SSE-NEXT: movdqa (%r8), %xmm9 5177; SSE-NEXT: movdqa (%r9), %xmm8 5178; SSE-NEXT: movdqa %xmm12, %xmm0 5179; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] 5180; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5181; SSE-NEXT: movdqa %xmm10, %xmm7 5182; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] 5183; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5184; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3] 5185; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,1,3,3,4,5,6,7] 5186; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm5[0,1] 5187; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] 5188; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,65535,0] 5189; SSE-NEXT: andps %xmm14, %xmm7 5190; SSE-NEXT: movdqa %xmm8, %xmm5 5191; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5192; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] 5193; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] 5194; SSE-NEXT: movaps %xmm14, %xmm0 5195; SSE-NEXT: andnps %xmm8, %xmm0 5196; SSE-NEXT: orps %xmm7, %xmm0 5197; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5198; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] 5199; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5200; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] 5201; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5202; SSE-NEXT: movdqa %xmm10, %xmm4 5203; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm12[3,3] 5204; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,7,7] 5205; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm6[2,3] 5206; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] 5207; SSE-NEXT: andps %xmm14, %xmm4 5208; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,6,7] 5209; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] 5210; SSE-NEXT: movaps %xmm14, %xmm0 5211; SSE-NEXT: andnps %xmm6, %xmm0 5212; SSE-NEXT: orps %xmm4, %xmm0 5213; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5214; SSE-NEXT: movdqa %xmm2, %xmm0 5215; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 5216; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5217; SSE-NEXT: movdqa %xmm11, %xmm4 5218; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 5219; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5220; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm0[3,3] 5221; SSE-NEXT: movdqa 16(%r8), %xmm5 5222; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7] 5223; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5224; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm6[0,1] 5225; SSE-NEXT: movdqa 16(%r9), %xmm7 5226; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7] 5227; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5228; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] 5229; SSE-NEXT: movaps %xmm14, %xmm0 5230; SSE-NEXT: andnps %xmm6, %xmm0 5231; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] 5232; SSE-NEXT: andps %xmm14, %xmm4 5233; SSE-NEXT: orps %xmm4, %xmm0 5234; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5235; SSE-NEXT: punpckhwd 
{{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 5236; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5237; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] 5238; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5239; SSE-NEXT: movdqa %xmm11, %xmm1 5240; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] 5241; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7] 5242; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[2,3] 5243; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,6,6,7] 5244; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] 5245; SSE-NEXT: movaps %xmm14, %xmm0 5246; SSE-NEXT: andnps %xmm3, %xmm0 5247; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 5248; SSE-NEXT: andps %xmm14, %xmm1 5249; SSE-NEXT: orps %xmm1, %xmm0 5250; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5251; SSE-NEXT: movdqa 32(%rdx), %xmm2 5252; SSE-NEXT: movdqa 32(%rcx), %xmm1 5253; SSE-NEXT: movdqa %xmm2, %xmm0 5254; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 5255; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5256; SSE-NEXT: movdqa 32(%rdi), %xmm3 5257; SSE-NEXT: movdqa 32(%rsi), %xmm6 5258; SSE-NEXT: movdqa %xmm3, %xmm7 5259; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 5260; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5261; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3] 5262; SSE-NEXT: movdqa 32(%r8), %xmm4 5263; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[2,1,3,3,4,5,6,7] 5264; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5265; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm8[0,1] 5266; SSE-NEXT: movdqa 32(%r9), %xmm5 5267; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] 5268; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5269; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] 5270; SSE-NEXT: movaps %xmm14, %xmm0 5271; SSE-NEXT: andnps %xmm8, %xmm0 5272; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] 5273; SSE-NEXT: andps %xmm14, %xmm7 5274; SSE-NEXT: orps %xmm7, %xmm0 5275; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5276; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 5277; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5278; SSE-NEXT: movdqa %xmm3, %xmm1 5279; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 5280; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5281; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] 5282; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,5,7,7] 5283; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm6[2,3] 5284; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,6,7] 5285; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] 5286; SSE-NEXT: movaps %xmm14, %xmm0 5287; SSE-NEXT: andnps %xmm6, %xmm0 5288; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 5289; SSE-NEXT: andps %xmm14, %xmm1 5290; SSE-NEXT: orps %xmm1, %xmm0 5291; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5292; SSE-NEXT: movdqa 48(%rdx), %xmm2 5293; SSE-NEXT: movdqa 48(%rcx), %xmm1 5294; SSE-NEXT: movdqa %xmm2, %xmm0 5295; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 5296; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5297; SSE-NEXT: 
movdqa 48(%rdi), %xmm3 5298; SSE-NEXT: movdqa 48(%rsi), %xmm7 5299; SSE-NEXT: movdqa %xmm3, %xmm8 5300; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 5301; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill 5302; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[3,3] 5303; SSE-NEXT: movdqa 48(%r8), %xmm6 5304; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[2,1,3,3,4,5,6,7] 5305; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm11[0,1] 5306; SSE-NEXT: movdqa 48(%r9), %xmm4 5307; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[0,2,2,3,4,5,6,7] 5308; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5309; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] 5310; SSE-NEXT: movaps %xmm14, %xmm0 5311; SSE-NEXT: andnps %xmm11, %xmm0 5312; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] 5313; SSE-NEXT: andps %xmm14, %xmm8 5314; SSE-NEXT: orps %xmm8, %xmm0 5315; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5316; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 5317; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5318; SSE-NEXT: movdqa %xmm3, %xmm1 5319; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] 5320; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5321; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] 5322; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,5,7,7] 5323; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm7[2,3] 5324; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,6,7] 5325; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] 5326; SSE-NEXT: movaps %xmm14, %xmm0 5327; SSE-NEXT: andnps %xmm7, %xmm0 5328; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 5329; SSE-NEXT: andps %xmm14, %xmm1 5330; SSE-NEXT: orps %xmm1, %xmm0 5331; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5332; SSE-NEXT: movdqa 64(%rdx), %xmm2 5333; SSE-NEXT: movdqa 64(%rcx), %xmm1 5334; SSE-NEXT: movdqa %xmm2, %xmm0 5335; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 5336; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5337; SSE-NEXT: movdqa 64(%rdi), %xmm3 5338; SSE-NEXT: movdqa 64(%rsi), %xmm8 5339; SSE-NEXT: movdqa %xmm3, %xmm11 5340; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] 5341; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5342; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[3,3] 5343; SSE-NEXT: movdqa 64(%r8), %xmm7 5344; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[2,1,3,3,4,5,6,7] 5345; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm12[0,1] 5346; SSE-NEXT: movdqa 64(%r9), %xmm4 5347; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm4[0,2,2,3,4,5,6,7] 5348; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5349; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] 5350; SSE-NEXT: movaps %xmm14, %xmm0 5351; SSE-NEXT: andnps %xmm12, %xmm0 5352; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] 5353; SSE-NEXT: andps %xmm14, %xmm11 5354; SSE-NEXT: orps %xmm11, %xmm0 5355; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5356; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 5357; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5358; SSE-NEXT: movdqa %xmm3, %xmm1 5359; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 
5360; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5361; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] 5362; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7] 5363; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm8[2,3] 5364; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,6,6,7] 5365; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] 5366; SSE-NEXT: movaps %xmm14, %xmm0 5367; SSE-NEXT: andnps %xmm8, %xmm0 5368; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 5369; SSE-NEXT: andps %xmm14, %xmm1 5370; SSE-NEXT: orps %xmm1, %xmm0 5371; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5372; SSE-NEXT: movdqa 80(%rdx), %xmm2 5373; SSE-NEXT: movdqa 80(%rcx), %xmm1 5374; SSE-NEXT: movdqa %xmm2, %xmm0 5375; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 5376; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5377; SSE-NEXT: movdqa 80(%rdi), %xmm3 5378; SSE-NEXT: movdqa 80(%rsi), %xmm11 5379; SSE-NEXT: movdqa %xmm3, %xmm12 5380; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] 5381; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5382; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[3,3] 5383; SSE-NEXT: movdqa 80(%r8), %xmm8 5384; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm8[2,1,3,3,4,5,6,7] 5385; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm15[0,1] 5386; SSE-NEXT: movdqa 80(%r9), %xmm4 5387; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[0,2,2,3,4,5,6,7] 5388; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5389; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] 5390; SSE-NEXT: movaps %xmm14, %xmm0 5391; SSE-NEXT: andnps %xmm15, %xmm0 5392; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] 5393; SSE-NEXT: andps %xmm14, %xmm12 5394; SSE-NEXT: orps %xmm12, %xmm0 5395; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5396; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 5397; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5398; SSE-NEXT: movdqa %xmm3, %xmm1 5399; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] 5400; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5401; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] 5402; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,6,5,7,7] 5403; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm11[2,3] 5404; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,6,6,7] 5405; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] 5406; SSE-NEXT: movaps %xmm14, %xmm0 5407; SSE-NEXT: andnps %xmm11, %xmm0 5408; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 5409; SSE-NEXT: andps %xmm14, %xmm1 5410; SSE-NEXT: orps %xmm1, %xmm0 5411; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5412; SSE-NEXT: movdqa 96(%rdx), %xmm2 5413; SSE-NEXT: movdqa 96(%rcx), %xmm1 5414; SSE-NEXT: movdqa %xmm2, %xmm0 5415; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 5416; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5417; SSE-NEXT: movdqa 96(%rdi), %xmm3 5418; SSE-NEXT: movdqa 96(%rsi), %xmm12 5419; SSE-NEXT: movdqa %xmm3, %xmm15 5420; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] 5421; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5422; SSE-NEXT: shufps {{.*#+}} xmm15 = 
xmm15[2,3],xmm0[3,3] 5423; SSE-NEXT: movdqa 96(%r8), %xmm11 5424; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] 5425; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm0[0,1] 5426; SSE-NEXT: movdqa 96(%r9), %xmm10 5427; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] 5428; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5429; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 5430; SSE-NEXT: movaps %xmm14, %xmm13 5431; SSE-NEXT: andnps %xmm0, %xmm13 5432; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] 5433; SSE-NEXT: andps %xmm14, %xmm15 5434; SSE-NEXT: orps %xmm15, %xmm13 5435; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5436; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 5437; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5438; SSE-NEXT: movdqa %xmm3, %xmm0 5439; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] 5440; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5441; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3] 5442; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,6,5,7,7] 5443; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] 5444; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,6,6,7] 5445; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 5446; SSE-NEXT: movaps %xmm14, %xmm12 5447; SSE-NEXT: andnps %xmm1, %xmm12 5448; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] 5449; SSE-NEXT: andps %xmm14, %xmm0 5450; SSE-NEXT: orps %xmm0, %xmm12 5451; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5452; SSE-NEXT: movdqa 112(%rdx), %xmm4 5453; SSE-NEXT: movdqa 112(%rcx), %xmm5 5454; SSE-NEXT: movdqa %xmm4, %xmm2 5455; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] 5456; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5457; SSE-NEXT: movdqa 112(%rdi), %xmm0 5458; SSE-NEXT: movdqa 112(%rsi), %xmm1 5459; SSE-NEXT: movdqa %xmm0, %xmm15 5460; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] 5461; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5462; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[3,3] 5463; SSE-NEXT: movdqa 112(%r8), %xmm3 5464; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[2,1,3,3,4,5,6,7] 5465; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5466; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm13[0,1] 5467; SSE-NEXT: movdqa 112(%r9), %xmm2 5468; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,2,2,3,4,5,6,7] 5469; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] 5470; SSE-NEXT: movaps %xmm14, %xmm12 5471; SSE-NEXT: andnps %xmm13, %xmm12 5472; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] 5473; SSE-NEXT: andps %xmm14, %xmm15 5474; SSE-NEXT: orps %xmm15, %xmm12 5475; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5476; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 5477; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5478; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 5479; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5480; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm4[3,3] 5481; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7] 5482; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] 5483; SSE-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[2,0,1,3] 5484; SSE-NEXT: andps %xmm14, %xmm0 5485; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] 5486; SSE-NEXT: movdqa %xmm2, %xmm15 5487; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 5488; SSE-NEXT: andnps %xmm1, %xmm14 5489; SSE-NEXT: orps %xmm0, %xmm14 5490; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5491; SSE-NEXT: movaps %xmm4, %xmm0 5492; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5493; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] 5494; SSE-NEXT: movdqa %xmm9, %xmm1 5495; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[1,3] 5496; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] 5497; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5498; SSE-NEXT: movdqa %xmm2, %xmm13 5499; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5] 5500; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,65535,65535] 5501; SSE-NEXT: movdqa %xmm12, %xmm1 5502; SSE-NEXT: pandn %xmm13, %xmm1 5503; SSE-NEXT: andps %xmm12, %xmm0 5504; SSE-NEXT: por %xmm0, %xmm1 5505; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5506; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] 5507; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,1,1,1,4,5,6,7] 5508; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] 5509; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] 5510; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] 5511; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,0,1,1] 5512; SSE-NEXT: movdqa %xmm10, %xmm1 5513; SSE-NEXT: pandn %xmm13, %xmm1 5514; SSE-NEXT: andps %xmm10, %xmm0 5515; SSE-NEXT: por %xmm0, %xmm1 5516; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5517; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5518; SSE-NEXT: movaps %xmm4, %xmm0 5519; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5520; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] 5521; SSE-NEXT: movdqa %xmm9, %xmm13 5522; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm3[1,3] 5523; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[0,2] 5524; SSE-NEXT: movdqa %xmm2, %xmm1 5525; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[2,2,3,3] 5526; SSE-NEXT: pslld $16, %xmm1 5527; SSE-NEXT: movdqa %xmm1, %xmm2 5528; SSE-NEXT: movdqa %xmm12, %xmm1 5529; SSE-NEXT: pandn %xmm2, %xmm1 5530; SSE-NEXT: andps %xmm12, %xmm0 5531; SSE-NEXT: por %xmm0, %xmm1 5532; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5533; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] 5534; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 5535; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm4[1,1] 5536; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2] 5537; SSE-NEXT: movdqa %xmm10, %xmm0 5538; SSE-NEXT: pandn %xmm13, %xmm0 5539; SSE-NEXT: andps %xmm10, %xmm9 5540; SSE-NEXT: por %xmm9, %xmm0 5541; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5542; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 5543; SSE-NEXT: movaps %xmm9, %xmm0 5544; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5545; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] 5546; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5547; SSE-NEXT: movaps %xmm3, %xmm2 5548; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[1,3] 5549; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] 5550; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte 
Reload 5551; SSE-NEXT: movdqa %xmm4, %xmm2 5552; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] 5553; SSE-NEXT: movdqa %xmm12, %xmm1 5554; SSE-NEXT: pandn %xmm2, %xmm1 5555; SSE-NEXT: andps %xmm12, %xmm0 5556; SSE-NEXT: por %xmm0, %xmm1 5557; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5558; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] 5559; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] 5560; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] 5561; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2] 5562; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] 5563; SSE-NEXT: movdqa %xmm4, %xmm5 5564; SSE-NEXT: movdqa %xmm10, %xmm1 5565; SSE-NEXT: pandn %xmm2, %xmm1 5566; SSE-NEXT: andps %xmm10, %xmm0 5567; SSE-NEXT: por %xmm0, %xmm1 5568; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5569; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 5570; SSE-NEXT: movaps %xmm9, %xmm0 5571; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5572; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] 5573; SSE-NEXT: movaps %xmm3, %xmm2 5574; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm4[1,3] 5575; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] 5576; SSE-NEXT: movdqa %xmm5, %xmm1 5577; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] 5578; SSE-NEXT: pslld $16, %xmm1 5579; SSE-NEXT: movdqa %xmm12, %xmm5 5580; SSE-NEXT: pandn %xmm1, %xmm5 5581; SSE-NEXT: andps %xmm12, %xmm0 5582; SSE-NEXT: por %xmm0, %xmm5 5583; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5584; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm9[1] 5585; SSE-NEXT: movdqa %xmm3, %xmm0 5586; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 5587; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] 5588; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] 5589; SSE-NEXT: movdqa %xmm10, %xmm1 5590; SSE-NEXT: pandn %xmm2, %xmm1 5591; SSE-NEXT: andps %xmm10, %xmm0 5592; SSE-NEXT: por %xmm0, %xmm1 5593; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5594; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5595; SSE-NEXT: movaps %xmm5, %xmm0 5596; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5597; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] 5598; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5599; SSE-NEXT: movaps %xmm1, %xmm2 5600; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[1,3] 5601; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] 5602; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5603; SSE-NEXT: movdqa %xmm3, %xmm2 5604; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] 5605; SSE-NEXT: movdqa %xmm12, %xmm9 5606; SSE-NEXT: pandn %xmm2, %xmm9 5607; SSE-NEXT: andps %xmm12, %xmm0 5608; SSE-NEXT: por %xmm0, %xmm9 5609; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5610; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] 5611; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] 5612; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] 5613; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] 5614; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] 5615; SSE-NEXT: movdqa %xmm10, %xmm4 5616; SSE-NEXT: pandn %xmm2, %xmm4 5617; SSE-NEXT: andps %xmm10, %xmm0 5618; SSE-NEXT: por %xmm0, %xmm4 5619; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5620; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5621; SSE-NEXT: movaps %xmm5, %xmm0 5622; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5623; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] 5624; SSE-NEXT: movaps %xmm1, %xmm2 5625; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm4[1,3] 5626; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] 5627; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] 5628; SSE-NEXT: pslld $16, %xmm3 5629; SSE-NEXT: movdqa %xmm12, %xmm9 5630; SSE-NEXT: pandn %xmm3, %xmm9 5631; SSE-NEXT: andps %xmm12, %xmm0 5632; SSE-NEXT: por %xmm0, %xmm9 5633; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5634; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] 5635; SSE-NEXT: movdqa %xmm1, %xmm0 5636; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 5637; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] 5638; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] 5639; SSE-NEXT: movdqa %xmm10, %xmm1 5640; SSE-NEXT: pandn %xmm2, %xmm1 5641; SSE-NEXT: andps %xmm10, %xmm0 5642; SSE-NEXT: por %xmm0, %xmm1 5643; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5644; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5645; SSE-NEXT: movaps %xmm4, %xmm0 5646; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload 5647; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] 5648; SSE-NEXT: movdqa %xmm6, %xmm2 5649; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] 5650; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] 5651; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5652; SSE-NEXT: movdqa %xmm1, %xmm2 5653; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] 5654; SSE-NEXT: movdqa %xmm12, %xmm5 5655; SSE-NEXT: pandn %xmm2, %xmm5 5656; SSE-NEXT: andps %xmm12, %xmm0 5657; SSE-NEXT: por %xmm0, %xmm5 5658; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5659; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] 5660; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] 5661; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] 5662; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] 5663; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] 5664; SSE-NEXT: movdqa %xmm10, %xmm3 5665; SSE-NEXT: pandn %xmm2, %xmm3 5666; SSE-NEXT: andps %xmm10, %xmm0 5667; SSE-NEXT: por %xmm0, %xmm3 5668; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill 5669; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5670; SSE-NEXT: movaps %xmm4, %xmm0 5671; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5672; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] 5673; SSE-NEXT: movdqa %xmm6, %xmm2 5674; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] 5675; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] 5676; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] 5677; SSE-NEXT: pslld $16, %xmm1 5678; SSE-NEXT: movdqa %xmm12, %xmm5 5679; SSE-NEXT: pandn %xmm1, %xmm5 5680; SSE-NEXT: andps %xmm12, %xmm0 5681; SSE-NEXT: por %xmm0, %xmm5 5682; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5683; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] 5684; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 5685; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1] 5686; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[0,2] 5687; SSE-NEXT: movdqa %xmm10, %xmm0 5688; SSE-NEXT: pandn %xmm2, %xmm0 5689; SSE-NEXT: andps %xmm10, %xmm6 
5690; SSE-NEXT: por %xmm6, %xmm0 5691; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5692; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5693; SSE-NEXT: movaps %xmm4, %xmm0 5694; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5695; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] 5696; SSE-NEXT: movdqa %xmm7, %xmm2 5697; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] 5698; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] 5699; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5700; SSE-NEXT: movdqa %xmm1, %xmm2 5701; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] 5702; SSE-NEXT: movdqa %xmm12, %xmm5 5703; SSE-NEXT: pandn %xmm2, %xmm5 5704; SSE-NEXT: andps %xmm12, %xmm0 5705; SSE-NEXT: por %xmm0, %xmm5 5706; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5707; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] 5708; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[1,1,1,1,4,5,6,7] 5709; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] 5710; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] 5711; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] 5712; SSE-NEXT: movdqa %xmm10, %xmm3 5713; SSE-NEXT: pandn %xmm2, %xmm3 5714; SSE-NEXT: andps %xmm10, %xmm0 5715; SSE-NEXT: por %xmm0, %xmm3 5716; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5717; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5718; SSE-NEXT: movaps %xmm4, %xmm0 5719; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5720; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] 5721; SSE-NEXT: movdqa %xmm7, %xmm2 5722; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] 5723; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] 5724; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] 5725; SSE-NEXT: pslld $16, %xmm1 5726; SSE-NEXT: movdqa %xmm12, %xmm5 5727; SSE-NEXT: pandn %xmm1, %xmm5 5728; SSE-NEXT: andps %xmm12, %xmm0 5729; SSE-NEXT: por %xmm0, %xmm5 5730; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5731; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] 5732; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 5733; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm4[1,1] 5734; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,2] 5735; SSE-NEXT: movdqa %xmm10, %xmm0 5736; SSE-NEXT: pandn %xmm2, %xmm0 5737; SSE-NEXT: andps %xmm10, %xmm7 5738; SSE-NEXT: por %xmm7, %xmm0 5739; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5740; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5741; SSE-NEXT: movaps %xmm4, %xmm0 5742; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5743; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] 5744; SSE-NEXT: movdqa %xmm8, %xmm2 5745; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] 5746; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] 5747; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5748; SSE-NEXT: movdqa %xmm1, %xmm2 5749; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] 5750; SSE-NEXT: movdqa %xmm12, %xmm5 5751; SSE-NEXT: pandn %xmm2, %xmm5 5752; SSE-NEXT: andps %xmm12, %xmm0 5753; SSE-NEXT: por %xmm0, %xmm5 5754; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5755; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] 5756; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] 5757; SSE-NEXT: 
shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] 5758; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] 5759; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] 5760; SSE-NEXT: movdqa %xmm10, %xmm3 5761; SSE-NEXT: pandn %xmm2, %xmm3 5762; SSE-NEXT: andps %xmm10, %xmm0 5763; SSE-NEXT: por %xmm0, %xmm3 5764; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5765; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5766; SSE-NEXT: movaps %xmm4, %xmm0 5767; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5768; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] 5769; SSE-NEXT: movdqa %xmm8, %xmm2 5770; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] 5771; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] 5772; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] 5773; SSE-NEXT: pslld $16, %xmm1 5774; SSE-NEXT: movdqa %xmm12, %xmm5 5775; SSE-NEXT: pandn %xmm1, %xmm5 5776; SSE-NEXT: andps %xmm12, %xmm0 5777; SSE-NEXT: por %xmm0, %xmm5 5778; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5779; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] 5780; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 5781; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm4[1,1] 5782; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[0,2] 5783; SSE-NEXT: movdqa %xmm10, %xmm0 5784; SSE-NEXT: pandn %xmm2, %xmm0 5785; SSE-NEXT: andps %xmm10, %xmm8 5786; SSE-NEXT: por %xmm8, %xmm0 5787; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5788; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5789; SSE-NEXT: movaps %xmm4, %xmm0 5790; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5791; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] 5792; SSE-NEXT: movdqa %xmm11, %xmm2 5793; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] 5794; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] 5795; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5796; SSE-NEXT: movdqa %xmm1, %xmm2 5797; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] 5798; SSE-NEXT: movdqa %xmm12, %xmm13 5799; SSE-NEXT: pandn %xmm2, %xmm13 5800; SSE-NEXT: andps %xmm12, %xmm0 5801; SSE-NEXT: por %xmm0, %xmm13 5802; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] 5803; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,1,1,4,5,6,7] 5804; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] 5805; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] 5806; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] 5807; SSE-NEXT: movdqa %xmm10, %xmm9 5808; SSE-NEXT: pandn %xmm2, %xmm9 5809; SSE-NEXT: andps %xmm10, %xmm0 5810; SSE-NEXT: por %xmm0, %xmm9 5811; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 5812; SSE-NEXT: movaps %xmm7, %xmm0 5813; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5814; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] 5815; SSE-NEXT: movdqa %xmm11, %xmm2 5816; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] 5817; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] 5818; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] 5819; SSE-NEXT: pslld $16, %xmm1 5820; SSE-NEXT: movdqa %xmm12, %xmm6 5821; SSE-NEXT: pandn %xmm1, %xmm6 5822; SSE-NEXT: andps %xmm12, %xmm0 5823; SSE-NEXT: por %xmm0, %xmm6 5824; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] 5825; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 5826; SSE-NEXT: shufps {{.*#+}} xmm11 = 
xmm11[1,1],xmm7[1,1] 5827; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm3[0,2] 5828; SSE-NEXT: movdqa %xmm10, %xmm4 5829; SSE-NEXT: pandn %xmm5, %xmm4 5830; SSE-NEXT: andps %xmm10, %xmm11 5831; SSE-NEXT: por %xmm11, %xmm4 5832; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 5833; SSE-NEXT: movaps %xmm7, %xmm0 5834; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5835; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] 5836; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5837; SSE-NEXT: movaps %xmm1, %xmm5 5838; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[1,3] 5839; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2] 5840; SSE-NEXT: movdqa %xmm15, %xmm5 5841; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] 5842; SSE-NEXT: movdqa %xmm12, %xmm11 5843; SSE-NEXT: pandn %xmm5, %xmm11 5844; SSE-NEXT: andps %xmm12, %xmm0 5845; SSE-NEXT: por %xmm0, %xmm11 5846; SSE-NEXT: movaps %xmm7, %xmm0 5847; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] 5848; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,1,1,1,4,5,6,7] 5849; SSE-NEXT: movaps %xmm1, %xmm7 5850; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] 5851; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] 5852; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] 5853; SSE-NEXT: movdqa %xmm15, %xmm8 5854; SSE-NEXT: movdqa %xmm10, %xmm15 5855; SSE-NEXT: pandn %xmm1, %xmm15 5856; SSE-NEXT: andps %xmm10, %xmm5 5857; SSE-NEXT: por %xmm5, %xmm15 5858; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5859; SSE-NEXT: movaps %xmm3, %xmm1 5860; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5861; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] 5862; SSE-NEXT: movaps %xmm7, %xmm5 5863; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm2[1,3] 5864; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2] 5865; SSE-NEXT: andps %xmm12, %xmm1 5866; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] 5867; SSE-NEXT: pslld $16, %xmm8 5868; SSE-NEXT: pandn %xmm8, %xmm12 5869; SSE-NEXT: por %xmm1, %xmm12 5870; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] 5871; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 5872; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm3[1,1] 5873; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[0,2] 5874; SSE-NEXT: andps %xmm10, %xmm7 5875; SSE-NEXT: pandn %xmm5, %xmm10 5876; SSE-NEXT: por %xmm7, %xmm10 5877; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 5878; SSE-NEXT: movdqa %xmm10, 736(%rax) 5879; SSE-NEXT: movdqa %xmm12, 720(%rax) 5880; SSE-NEXT: movdqa %xmm15, 688(%rax) 5881; SSE-NEXT: movdqa %xmm11, 672(%rax) 5882; SSE-NEXT: movdqa %xmm4, 640(%rax) 5883; SSE-NEXT: movdqa %xmm6, 624(%rax) 5884; SSE-NEXT: movdqa %xmm9, 592(%rax) 5885; SSE-NEXT: movdqa %xmm13, 576(%rax) 5886; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5887; SSE-NEXT: movaps %xmm0, 544(%rax) 5888; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5889; SSE-NEXT: movaps %xmm0, 528(%rax) 5890; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5891; SSE-NEXT: movaps %xmm0, 496(%rax) 5892; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5893; SSE-NEXT: movaps %xmm0, 480(%rax) 5894; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5895; SSE-NEXT: movaps %xmm0, 448(%rax) 5896; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5897; SSE-NEXT: movaps %xmm0, 432(%rax) 5898; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5899; SSE-NEXT: movaps %xmm0, 400(%rax) 5900; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5901; SSE-NEXT: movaps %xmm0, 384(%rax) 5902; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5903; SSE-NEXT: movaps %xmm0, 352(%rax) 5904; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5905; SSE-NEXT: movaps %xmm0, 336(%rax) 5906; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 5907; SSE-NEXT: movaps %xmm0, 304(%rax) 5908; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5909; SSE-NEXT: movaps %xmm0, 288(%rax) 5910; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5911; SSE-NEXT: movaps %xmm0, 256(%rax) 5912; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5913; SSE-NEXT: movaps %xmm0, 240(%rax) 5914; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5915; SSE-NEXT: movaps %xmm0, 208(%rax) 5916; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5917; SSE-NEXT: movaps %xmm0, 192(%rax) 5918; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5919; SSE-NEXT: movaps %xmm0, 160(%rax) 5920; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5921; SSE-NEXT: movaps %xmm0, 144(%rax) 5922; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5923; SSE-NEXT: movaps %xmm0, 112(%rax) 5924; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5925; SSE-NEXT: movaps %xmm0, 96(%rax) 5926; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5927; SSE-NEXT: movaps %xmm0, 64(%rax) 5928; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5929; SSE-NEXT: movaps %xmm0, 48(%rax) 5930; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5931; SSE-NEXT: movaps %xmm0, 16(%rax) 5932; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5933; SSE-NEXT: movaps %xmm0, (%rax) 5934; SSE-NEXT: movaps %xmm14, 752(%rax) 5935; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5936; SSE-NEXT: movaps %xmm0, 704(%rax) 5937; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5938; SSE-NEXT: movaps %xmm0, 656(%rax) 5939; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5940; SSE-NEXT: movaps %xmm0, 608(%rax) 5941; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5942; SSE-NEXT: movaps %xmm0, 560(%rax) 5943; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5944; SSE-NEXT: movaps %xmm0, 512(%rax) 5945; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5946; SSE-NEXT: movaps %xmm0, 464(%rax) 5947; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5948; SSE-NEXT: movaps %xmm0, 416(%rax) 5949; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5950; SSE-NEXT: movaps %xmm0, 368(%rax) 5951; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5952; SSE-NEXT: movaps %xmm0, 320(%rax) 5953; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5954; SSE-NEXT: movaps %xmm0, 272(%rax) 5955; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5956; SSE-NEXT: movaps %xmm0, 224(%rax) 5957; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5958; SSE-NEXT: movaps %xmm0, 176(%rax) 5959; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5960; SSE-NEXT: movaps %xmm0, 128(%rax) 5961; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5962; SSE-NEXT: 
movaps %xmm0, 80(%rax) 5963; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5964; SSE-NEXT: movaps %xmm0, 32(%rax) 5965; SSE-NEXT: addq $808, %rsp # imm = 0x328 5966; SSE-NEXT: retq 5967; 5968; AVX-LABEL: store_i16_stride6_vf64: 5969; AVX: # %bb.0: 5970; AVX-NEXT: subq $504, %rsp # imm = 0x1F8 5971; AVX-NEXT: vmovdqa 80(%rcx), %xmm1 5972; AVX-NEXT: vmovdqa 80(%rdx), %xmm2 5973; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 5974; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] 5975; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 5976; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] 5977; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm2 5978; AVX-NEXT: vmovdqa 80(%rsi), %xmm3 5979; AVX-NEXT: vmovdqa 80(%rdi), %xmm5 5980; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] 5981; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] 5982; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 5983; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] 5984; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 5985; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 5986; AVX-NEXT: vmovdqa 80(%r8), %xmm2 5987; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] 5988; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 5989; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0],xmm6[1,2],xmm3[3] 5990; AVX-NEXT: vmovdqa 80(%r9), %xmm3 5991; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] 5992; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] 5993; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6],xmm8[7] 5994; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5995; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 5996; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] 5997; AVX-NEXT: vpslld $16, %xmm3, %xmm7 5998; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7] 5999; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6000; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] 6001; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 6002; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] 6003; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] 6004; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 6005; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 6006; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6007; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3] 6008; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,2,3,3] 6009; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] 6010; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6011; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 6012; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7] 6013; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] 6014; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] 6015; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] 6016; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] 6017; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] 6018; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6019; AVX-NEXT: vmovdqa 64(%rsi), %xmm8 6020; AVX-NEXT: vmovdqa 64(%rdi), %xmm9 6021; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = 
xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 6022; AVX-NEXT: vmovdqa 64(%rcx), %xmm10 6023; AVX-NEXT: vmovdqa 64(%rdx), %xmm11 6024; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] 6025; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,2] 6026; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[2,2,3,3] 6027; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 6028; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] 6029; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 6030; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] 6031; AVX-NEXT: vmovdqa 64(%r8), %xmm5 6032; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6033; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm6[1],xmm12[2,3] 6034; AVX-NEXT: vmovdqa 64(%r9), %xmm6 6035; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[2,2,3,3] 6036; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7] 6037; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6038; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 6039; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,6,5,7,7] 6040; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] 6041; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7] 6042; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm6[0,1,2,3,4,6,6,7] 6043; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] 6044; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7] 6045; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6046; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] 6047; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] 6048; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 6049; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1] 6050; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 6051; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 6052; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm2[0],xmm0[3] 6053; AVX-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] 6054; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7] 6055; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6056; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 6057; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 6058; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 6059; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] 6060; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] 6061; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6062; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 6063; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 6064; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] 6065; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 6066; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 6067; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] 6068; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 6069; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] 6070; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],xmm5[0],xmm1[3] 6071; AVX-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5] 6072; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm8[5],xmm3[6,7] 6073; AVX-NEXT: 
vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6074; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 6075; AVX-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero 6076; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] 6077; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,0,1,1] 6078; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] 6079; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6080; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 6081; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] 6082; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 6083; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] 6084; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] 6085; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 6086; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 6087; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,3,3,4,5,6,7] 6088; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 6089; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] 6090; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] 6091; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 6092; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7] 6093; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6094; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 6095; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5],xmm0[6,7] 6096; AVX-NEXT: vpslld $16, %xmm6, %xmm1 6097; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] 6098; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6099; AVX-NEXT: vmovdqa 48(%rcx), %xmm1 6100; AVX-NEXT: vmovdqa 48(%rdx), %xmm2 6101; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 6102; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] 6103; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 6104; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] 6105; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm2 6106; AVX-NEXT: vmovdqa 48(%rsi), %xmm3 6107; AVX-NEXT: vmovdqa 48(%rdi), %xmm5 6108; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] 6109; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 6110; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 6111; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] 6112; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 6113; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 6114; AVX-NEXT: vmovdqa 48(%r8), %xmm2 6115; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] 6116; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 6117; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0],xmm6[1,2],xmm3[3] 6118; AVX-NEXT: vmovdqa 48(%r9), %xmm3 6119; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] 6120; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] 6121; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6],xmm8[7] 6122; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6123; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 6124; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] 6125; AVX-NEXT: vpslld $16, %xmm3, %xmm7 6126; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7] 6127; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6128; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] 6129; AVX-NEXT: 
vinsertf128 $1, %xmm6, %ymm5, %ymm5 6130; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] 6131; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] 6132; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 6133; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 6134; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6135; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3] 6136; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,2,3,3] 6137; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] 6138; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6139; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 6140; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7] 6141; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] 6142; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] 6143; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] 6144; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] 6145; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] 6146; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6147; AVX-NEXT: vmovdqa 32(%rcx), %xmm8 6148; AVX-NEXT: vmovdqa 32(%rdx), %xmm9 6149; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 6150; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] 6151; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] 6152; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 6153; AVX-NEXT: vmovdqa 32(%rsi), %xmm10 6154; AVX-NEXT: vmovdqa 32(%rdi), %xmm11 6155; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] 6156; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] 6157; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 6158; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 6159; AVX-NEXT: vmovdqa 32(%r8), %xmm4 6160; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6161; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm5[1],xmm12[2,3] 6162; AVX-NEXT: vmovdqa 32(%r9), %xmm5 6163; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[2,2,3,3] 6164; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7] 6165; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6166; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 6167; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,6,5,7,7] 6168; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] 6169; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7] 6170; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,6,6,7] 6171; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] 6172; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7] 6173; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6174; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] 6175; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] 6176; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 6177; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1] 6178; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 6179; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 6180; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm2[0],xmm0[3] 6181; AVX-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] 6182; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7] 6183; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill 6184; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 6185; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 6186; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 6187; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] 6188; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] 6189; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6190; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 6191; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 6192; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] 6193; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 6194; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 6195; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] 6196; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 6197; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] 6198; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],xmm4[0],xmm1[3] 6199; AVX-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] 6200; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm8[5],xmm3[6,7] 6201; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6202; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 6203; AVX-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 6204; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] 6205; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,0,1,1] 6206; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] 6207; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6208; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 6209; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] 6210; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 6211; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] 6212; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] 6213; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 6214; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 6215; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] 6216; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 6217; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] 6218; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] 6219; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 6220; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7] 6221; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6222; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 6223; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7] 6224; AVX-NEXT: vpslld $16, %xmm5, %xmm1 6225; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] 6226; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6227; AVX-NEXT: vmovdqa 112(%rcx), %xmm1 6228; AVX-NEXT: vmovdqa 112(%rdx), %xmm2 6229; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 6230; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 6231; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] 6232; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] 6233; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 6234; AVX-NEXT: vmovdqa 112(%rsi), %xmm3 6235; AVX-NEXT: vmovdqa 112(%rdi), %xmm5 6236; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] 6237; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 6238; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 6239; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] 6240; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 6241; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 6242; AVX-NEXT: vmovdqa 112(%r8), %xmm2 6243; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] 6244; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 6245; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0],xmm6[1,2],xmm3[3] 6246; AVX-NEXT: vmovdqa 112(%r9), %xmm3 6247; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] 6248; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] 6249; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6],xmm8[7] 6250; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6251; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 6252; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] 6253; AVX-NEXT: vpslld $16, %xmm3, %xmm7 6254; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7] 6255; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6256; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] 6257; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 6258; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] 6259; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] 6260; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 6261; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 6262; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6263; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3] 6264; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,2,3,3] 6265; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] 6266; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6267; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 6268; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7] 6269; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] 6270; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] 6271; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] 6272; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] 6273; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] 6274; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6275; AVX-NEXT: vmovdqa 96(%rcx), %xmm8 6276; AVX-NEXT: vmovdqa 96(%rdx), %xmm9 6277; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 6278; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] 6279; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] 6280; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 6281; AVX-NEXT: vmovdqa 96(%rsi), %xmm10 6282; AVX-NEXT: vmovdqa 96(%rdi), %xmm11 6283; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] 6284; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] 6285; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 6286; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 6287; AVX-NEXT: vmovdqa 96(%r8), %xmm4 6288; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6289; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm5[1],xmm12[2,3] 6290; AVX-NEXT: vmovdqa 96(%r9), %xmm5 6291; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[2,2,3,3] 6292; AVX-NEXT: 
vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7] 6293; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6294; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 6295; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,6,5,7,7] 6296; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] 6297; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7] 6298; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,6,6,7] 6299; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] 6300; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7] 6301; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6302; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] 6303; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] 6304; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 6305; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1] 6306; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 6307; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 6308; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm2[0],xmm0[3] 6309; AVX-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] 6310; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7] 6311; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill 6312; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 6313; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 6314; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 6315; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] 6316; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] 6317; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6318; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 6319; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 6320; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] 6321; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 6322; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 6323; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] 6324; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 6325; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] 6326; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],xmm4[0],xmm1[3] 6327; AVX-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] 6328; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm8[5],xmm3[6,7] 6329; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6330; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 6331; AVX-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 6332; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] 6333; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,0,1,1] 6334; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] 6335; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6336; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 6337; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] 6338; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 6339; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] 6340; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] 6341; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 6342; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 6343; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] 6344; AVX-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 6345; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] 6346; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] 6347; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 6348; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7] 6349; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6350; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 6351; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7] 6352; AVX-NEXT: vpslld $16, %xmm5, %xmm1 6353; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] 6354; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6355; AVX-NEXT: vmovdqa 16(%rcx), %xmm0 6356; AVX-NEXT: vmovdqa 16(%rdx), %xmm1 6357; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6358; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 6359; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] 6360; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] 6361; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 6362; AVX-NEXT: vmovdqa 16(%rsi), %xmm2 6363; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 6364; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 6365; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 6366; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] 6367; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,1] 6368; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 6369; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 6370; AVX-NEXT: vmovdqa 16(%r8), %xmm3 6371; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] 6372; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 6373; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm1[1,2],xmm2[3] 6374; AVX-NEXT: vmovdqa 16(%r9), %xmm2 6375; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] 6376; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] 6377; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] 6378; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6379; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 6380; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] 6381; AVX-NEXT: vpslld $16, %xmm2, %xmm5 6382; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5],xmm1[6,7] 6383; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6384; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] 6385; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 6386; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] 6387; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 6388; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 6389; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 6390; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6391; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] 6392; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] 6393; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] 6394; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6395; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 6396; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7] 6397; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 6398; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] 6399; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] 6400; 
AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 6401; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] 6402; AVX-NEXT: vmovdqa (%rcx), %xmm9 6403; AVX-NEXT: vmovdqa (%rdx), %xmm8 6404; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] 6405; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] 6406; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] 6407; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 6408; AVX-NEXT: vmovdqa (%rsi), %xmm7 6409; AVX-NEXT: vmovdqa (%rdi), %xmm6 6410; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 6411; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] 6412; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 6413; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 6414; AVX-NEXT: vmovdqa (%r8), %xmm1 6415; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6416; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm0[1],xmm12[2,3] 6417; AVX-NEXT: vmovdqa (%r9), %xmm0 6418; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] 6419; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3],xmm13[4,5,6,7] 6420; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 6421; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,6,5,7,7] 6422; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] 6423; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3,4,5],xmm15[6,7] 6424; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,6,6,7] 6425; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] 6426; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3,4,5,6],xmm15[7] 6427; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,0,1,1] 6428; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] 6429; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm11 6430; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,1,0,1] 6431; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm15, %ymm10 6432; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] 6433; AVX-NEXT: vinsertps {{.*#+}} xmm10 = xmm11[0,1],xmm3[0],xmm11[3] 6434; AVX-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] 6435; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm15[5],xmm10[6,7] 6436; AVX-NEXT: vextractf128 $1, %ymm11, %xmm11 6437; AVX-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 6438; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3],xmm11[4,5,6,7] 6439; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] 6440; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] 6441; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] 6442; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] 6443; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] 6444; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 6445; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] 6446; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] 6447; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 6448; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] 6449; AVX-NEXT: vinsertps {{.*#+}} xmm8 = xmm7[0,1],xmm1[0],xmm7[3] 6450; AVX-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 6451; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] 6452; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 6453; AVX-NEXT: 
vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3],xmm7[4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm3[1,2],xmm4[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7]
; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7]
; AVX-NEXT: vpslld $16, %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovdqa %xmm0, 48(%rax)
; AVX-NEXT: vmovdqa %xmm4, 32(%rax)
; AVX-NEXT: vmovdqa %xmm7, 16(%rax)
; AVX-NEXT: vmovdqa %xmm8, (%rax)
; AVX-NEXT: vmovdqa %xmm2, 112(%rax)
; AVX-NEXT: vmovdqa %xmm10, 96(%rax)
; AVX-NEXT: vmovdqa %xmm12, 80(%rax)
; AVX-NEXT: vmovdqa %xmm13, 64(%rax)
; AVX-NEXT: vmovdqa %xmm14, 176(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 160(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 144(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 128(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 624(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 608(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 592(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 576(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 688(%rax)
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 672(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 656(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 640(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 752(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 736(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 720(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 704(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 240(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 224(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 208(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 192(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 304(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 288(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 272(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 256(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 368(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 352(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 336(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 320(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 432(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 416(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 400(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 384(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 496(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 480(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 464(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 448(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 560(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 544(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 528(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 512(%rax)
; AVX-NEXT: addq $504, %rsp # imm = 0x1F8
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i16_stride6_vf64:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $1544, %rsp # imm = 0x608
; AVX2-NEXT: vmovdqa (%rcx), %xmm12
; AVX2-NEXT: vmovdqa 32(%rcx), %xmm5
; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa (%rdx), %xmm11
; AVX2-NEXT: vmovdqa 32(%rdx), %xmm6
; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 =
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6580; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 6581; AVX2-NEXT: vmovdqa (%rsi), %xmm14 6582; AVX2-NEXT: vmovdqa 32(%rsi), %xmm13 6583; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,1,2,1] 6584; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] 6585; AVX2-NEXT: vmovdqa (%rdi), %xmm10 6586; AVX2-NEXT: vmovdqa 32(%rdi), %xmm8 6587; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,2,1] 6588; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] 6589; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 6590; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 6591; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 6592; AVX2-NEXT: vmovdqa (%r8), %xmm1 6593; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6594; AVX2-NEXT: vmovdqa 32(%r8), %xmm4 6595; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6596; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] 6597; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 6598; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] 6599; AVX2-NEXT: vmovdqa (%r9), %xmm0 6600; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6601; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 6602; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 6603; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] 6604; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 6605; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6606; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6607; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6608; AVX2-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6609; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 6610; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,1,2,1] 6611; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6612; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] 6613; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1] 6614; AVX2-NEXT: vmovdqa %xmm8, %xmm5 6615; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6616; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] 6617; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 6618; AVX2-NEXT: vmovdqa 32(%r9), %xmm3 6619; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6620; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 6621; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 6622; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 6623; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7] 6624; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 6625; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 6626; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] 6627; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] 6628; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 6629; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6630; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6631; AVX2-NEXT: vmovdqa 64(%rcx), %xmm1 6632; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6633; AVX2-NEXT: vmovdqa 
64(%rdx), %xmm2 6634; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6635; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6636; AVX2-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6637; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 6638; AVX2-NEXT: vmovdqa 64(%rsi), %xmm9 6639; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,1] 6640; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] 6641; AVX2-NEXT: vmovdqa 64(%rdi), %xmm3 6642; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6643; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 6644; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] 6645; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 6646; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 6647; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 6648; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 6649; AVX2-NEXT: vmovdqa 64(%r8), %xmm2 6650; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6651; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] 6652; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 6653; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 6654; AVX2-NEXT: vmovdqa 64(%r9), %xmm2 6655; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6656; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] 6657; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] 6658; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 6659; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6660; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6661; AVX2-NEXT: vmovdqa 96(%rcx), %xmm1 6662; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6663; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6664; AVX2-NEXT: vmovdqa 96(%rdx), %xmm2 6665; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6666; AVX2-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 6667; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 6668; AVX2-NEXT: vmovdqa 96(%rsi), %xmm2 6669; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6670; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 6671; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] 6672; AVX2-NEXT: vmovdqa 96(%rdi), %xmm3 6673; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6674; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 6675; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] 6676; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 6677; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 6678; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 6679; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 6680; AVX2-NEXT: vmovdqa 96(%r8), %xmm2 6681; AVX2-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill 6682; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] 6683; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 6684; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 6685; AVX2-NEXT: vmovdqa 96(%r9), %xmm2 6686; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill 6687; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] 6688; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] 6689; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 6690; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6691; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6692; AVX2-NEXT: vmovdqa (%rdx), %ymm2 6693; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6694; AVX2-NEXT: vmovdqa (%rcx), %ymm1 6695; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6696; AVX2-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 6697; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 6698; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 6699; AVX2-NEXT: vmovdqa (%rsi), %ymm2 6700; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6701; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] 6702; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 6703; AVX2-NEXT: vmovdqa (%rdi), %ymm3 6704; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6705; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,2,3,6,5,6,7] 6706; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 6707; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 6708; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 6709; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 6710; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 6711; AVX2-NEXT: vmovdqa (%r8), %ymm2 6712; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6713; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 6714; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 6715; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 6716; AVX2-NEXT: vmovdqa (%r9), %ymm2 6717; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6718; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 6719; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 6720; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 6721; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6722; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6723; AVX2-NEXT: vmovdqa 32(%rdx), %ymm2 6724; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6725; AVX2-NEXT: vmovdqa 32(%rcx), %ymm1 6726; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6727; AVX2-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 6728; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 6729; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 6730; AVX2-NEXT: vmovdqa 32(%rsi), %ymm7 6731; 
AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,1,2,3,6,5,6,7] 6732; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6733; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 6734; AVX2-NEXT: vmovdqa 32(%rdi), %ymm15 6735; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,2,3,6,5,6,7] 6736; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6737; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 6738; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 6739; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 6740; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 6741; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 6742; AVX2-NEXT: vmovdqa 32(%r8), %ymm2 6743; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6744; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 6745; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 6746; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 6747; AVX2-NEXT: vmovdqa 32(%r9), %ymm2 6748; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6749; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 6750; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 6751; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 6752; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6753; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6754; AVX2-NEXT: vmovdqa 64(%rdx), %ymm2 6755; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6756; AVX2-NEXT: vmovdqa 64(%rcx), %ymm1 6757; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6758; AVX2-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 6759; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 6760; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 6761; AVX2-NEXT: vmovdqa 64(%rsi), %ymm2 6762; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6763; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] 6764; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 6765; AVX2-NEXT: vmovdqa 64(%rdi), %ymm3 6766; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6767; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,2,3,6,5,6,7] 6768; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 6769; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 6770; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 6771; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 6772; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 6773; AVX2-NEXT: vmovdqa 64(%r8), %ymm2 6774; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6775; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 6776; AVX2-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[2,2,2,3] 6777; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 6778; AVX2-NEXT: vmovdqa 64(%r9), %ymm2 6779; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6780; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 6781; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 6782; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 6783; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6784; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6785; AVX2-NEXT: vmovdqa 96(%rdx), %ymm3 6786; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6787; AVX2-NEXT: vmovdqa 96(%rcx), %ymm1 6788; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6789; AVX2-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 6790; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 6791; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 6792; AVX2-NEXT: vmovdqa 96(%rsi), %ymm6 6793; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,1,2,3,6,5,6,7] 6794; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6795; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 6796; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 6797; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6798; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,2,3,6,5,6,7] 6799; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 6800; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 6801; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 6802; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 6803; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 6804; AVX2-NEXT: vmovdqa 96(%r8), %ymm8 6805; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 6806; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6807; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 6808; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 6809; AVX2-NEXT: vmovdqa 96(%r9), %ymm2 6810; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6811; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 6812; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 6813; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 6814; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 6815; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6816; AVX2-NEXT: vmovdqa %xmm14, %xmm4 6817; AVX2-NEXT: vmovdqa %xmm10, %xmm3 6818; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] 6819; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 6820; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] 6821; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] 6822; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 6823; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 6824; AVX2-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 6825; AVX2-NEXT: # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 6826; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 6827; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 6828; AVX2-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6829; AVX2-NEXT: # xmm0 = mem[0,0,2,1,4,5,6,7] 6830; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2 6831; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 6832; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6833; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6834; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] 6835; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 6836; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6837; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload 6838; AVX2-NEXT: # xmm2 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] 6839; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] 6840; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 6841; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 6842; AVX2-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 6843; AVX2-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 6844; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 6845; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 6846; AVX2-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 6847; AVX2-NEXT: # xmm2 = mem[0,0,2,1,4,5,6,7] 6848; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 6849; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6850; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6851; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6852; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] 6853; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 6854; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6855; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 6856; AVX2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 6857; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] 6858; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 6859; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 6860; AVX2-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 6861; AVX2-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 6862; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 6863; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 6864; AVX2-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 6865; AVX2-NEXT: # xmm2 = mem[0,0,2,1,4,5,6,7] 6866; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 6867; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6868; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6869; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6870; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 6871; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 6872; AVX2-NEXT: vpermq {{.*#+}} 
ymm1 = ymm1[0,0,2,1] 6873; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6874; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 6875; AVX2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 6876; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] 6877; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 6878; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 6879; AVX2-NEXT: vpmovzxwd (%rsp), %xmm2 # 16-byte Folded Reload 6880; AVX2-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 6881; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 6882; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 6883; AVX2-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 6884; AVX2-NEXT: # xmm2 = mem[0,0,2,1,4,5,6,7] 6885; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 6886; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6887; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6888; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6889; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 6890; AVX2-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] 6891; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 6892; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6893; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 6894; AVX2-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11] 6895; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] 6896; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 6897; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 6898; AVX2-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 6899; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 6900; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 6901; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 6902; AVX2-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 6903; AVX2-NEXT: # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 6904; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 6905; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6906; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6907; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[8],ymm7[8],ymm15[9],ymm7[9],ymm15[10],ymm7[10],ymm15[11],ymm7[11] 6908; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 6909; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6910; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 6911; AVX2-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11] 6912; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] 6913; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 6914; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 6915; AVX2-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 6916; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 6917; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 
6918; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 6919; AVX2-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 6920; AVX2-NEXT: # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 6921; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 6922; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6923; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6924; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 6925; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 6926; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] 6927; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 6928; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 6929; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 6930; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] 6931; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] 6932; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 6933; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 6934; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 6935; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 6936; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 6937; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 6938; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 6939; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 6940; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 6941; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 6942; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6943; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 6944; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] 6945; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 6946; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6947; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6948; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] 6949; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] 6950; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 6951; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 6952; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 6953; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 6954; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 6955; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 6956; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 6957; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 6958; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 6959; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6960; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 6961; AVX2-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6962; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 6963; AVX2-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] 6964; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] 6965; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] 6966; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 6967; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 6968; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 6969; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6970; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm2 6971; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 6972; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] 6973; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 6974; AVX2-NEXT: # xmm2 = mem[2,3,2,3] 6975; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] 6976; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,1] 6977; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 6978; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 6979; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6980; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6981; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 6982; AVX2-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 6983; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload 6984; AVX2-NEXT: # xmm3 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] 6985; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] 6986; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3] 6987; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 6988; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] 6989; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6990; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm3 6991; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] 6992; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] 6993; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 6994; AVX2-NEXT: # xmm3 = mem[2,3,2,3] 6995; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] 6996; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] 6997; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 6998; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6999; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7000; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 7001; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7002; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload 7003; AVX2-NEXT: # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] 7004; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] 7005; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3] 7006; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 7007; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] 7008; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7009; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm3 7010; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] 7011; AVX2-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] 7012; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7013; AVX2-NEXT: # xmm3 = mem[2,3,2,3] 7014; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] 7015; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] 7016; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 7017; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7018; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7019; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 7020; AVX2-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 7021; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7022; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload 7023; AVX2-NEXT: # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] 7024; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] 7025; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3] 7026; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 7027; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] 7028; AVX2-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload 7029; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm1 7030; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7031; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 7032; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7033; AVX2-NEXT: # xmm1 = mem[2,3,2,3] 7034; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7] 7035; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7036; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm9 7037; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7038; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 7039; AVX2-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] 7040; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7041; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 7042; AVX2-NEXT: # ymm1 = ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[12],mem[12],ymm1[13],mem[13],ymm1[14],mem[14],ymm1[15],mem[15] 7043; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] 7044; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] 7045; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 7046; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 7047; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 7048; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7049; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm3 7050; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 7051; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] 7052; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 7053; AVX2-NEXT: # ymm3 = mem[2,3,2,3,6,7,6,7] 7054; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 7055; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 7056; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm5 7057; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7058; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 7059; AVX2-NEXT: # ymm1 = 
ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] 7060; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7061; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 7062; AVX2-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] 7063; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] 7064; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] 7065; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] 7066; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 7067; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7068; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 7069; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 7070; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 7071; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload 7072; AVX2-NEXT: # ymm1 = mem[2,3,2,3,6,7,6,7] 7073; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 7074; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 7075; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm3 7076; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] 7077; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] 7078; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] 7079; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] 7080; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 7081; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 7082; AVX2-NEXT: vpshufb %ymm2, %ymm14, %ymm1 7083; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 7084; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 7085; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] 7086; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 7087; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 7088; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 7089; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload 7090; AVX2-NEXT: # ymm1 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15] 7091; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 7092; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] 7093; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] 7094; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] 7095; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] 7096; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7] 7097; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 7098; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm2 7099; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 7100; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 7101; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,3,2,3,6,7,6,7] 7102; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = 
ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 7103; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 7104; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 7105; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 7106; AVX2-NEXT: vmovdqa %ymm1, 736(%rax) 7107; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7108; AVX2-NEXT: vmovaps %ymm1, 672(%rax) 7109; AVX2-NEXT: vmovdqa %ymm0, 544(%rax) 7110; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7111; AVX2-NEXT: vmovaps %ymm0, 480(%rax) 7112; AVX2-NEXT: vmovdqa %ymm3, 352(%rax) 7113; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7114; AVX2-NEXT: vmovaps %ymm0, 288(%rax) 7115; AVX2-NEXT: vmovdqa %ymm5, 160(%rax) 7116; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7117; AVX2-NEXT: vmovaps %ymm0, 96(%rax) 7118; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7119; AVX2-NEXT: vmovaps %ymm0, 704(%rax) 7120; AVX2-NEXT: vmovdqa %ymm9, 640(%rax) 7121; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7122; AVX2-NEXT: vmovaps %ymm0, 576(%rax) 7123; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7124; AVX2-NEXT: vmovaps %ymm0, 512(%rax) 7125; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7126; AVX2-NEXT: vmovaps %ymm0, 448(%rax) 7127; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7128; AVX2-NEXT: vmovaps %ymm0, 384(%rax) 7129; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7130; AVX2-NEXT: vmovaps %ymm0, 320(%rax) 7131; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7132; AVX2-NEXT: vmovaps %ymm0, 256(%rax) 7133; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7134; AVX2-NEXT: vmovaps %ymm0, 192(%rax) 7135; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7136; AVX2-NEXT: vmovaps %ymm0, 128(%rax) 7137; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7138; AVX2-NEXT: vmovaps %ymm0, 64(%rax) 7139; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7140; AVX2-NEXT: vmovaps %ymm0, (%rax) 7141; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7142; AVX2-NEXT: vmovaps %ymm0, 608(%rax) 7143; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7144; AVX2-NEXT: vmovaps %ymm0, 416(%rax) 7145; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7146; AVX2-NEXT: vmovaps %ymm0, 224(%rax) 7147; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7148; AVX2-NEXT: vmovaps %ymm0, 32(%rax) 7149; AVX2-NEXT: addq $1544, %rsp # imm = 0x608 7150; AVX2-NEXT: vzeroupper 7151; AVX2-NEXT: retq 7152; 7153; AVX2-FP-LABEL: store_i16_stride6_vf64: 7154; AVX2-FP: # %bb.0: 7155; AVX2-FP-NEXT: subq $1544, %rsp # imm = 0x608 7156; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm0 7157; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7158; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm5 7159; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 7160; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7161; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4 7162; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7163; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 7164; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 7165; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm1 7166; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7167; AVX2-FP-NEXT: vmovdqa 32(%rcx), 
%xmm11 7168; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 7169; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7170; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm6 7171; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7172; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 7173; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] 7174; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7175; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 7176; AVX2-FP-NEXT: vmovdqa (%r8), %xmm1 7177; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7178; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm2 7179; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7180; AVX2-FP-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 7181; AVX2-FP-NEXT: vpbroadcastq %xmm1, %ymm1 7182; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 7183; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1 7184; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7185; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm3 7186; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7187; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] 7188; AVX2-FP-NEXT: vpbroadcastq %xmm1, %ymm1 7189; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 7190; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 7191; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7192; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 7193; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7194; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 7195; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] 7196; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7197; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] 7198; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7199; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 7200; AVX2-FP-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 7201; AVX2-FP-NEXT: vpbroadcastq %xmm1, %ymm1 7202; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 7203; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,2,1,4,5,6,7] 7204; AVX2-FP-NEXT: vpbroadcastq %xmm1, %ymm1 7205; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 7206; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7207; AVX2-FP-NEXT: vmovdqa 64(%rsi), %xmm12 7208; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm0 7209; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7210; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] 7211; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7212; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 7213; AVX2-FP-NEXT: vmovdqa 64(%rcx), %xmm1 7214; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill 7215; AVX2-FP-NEXT: vmovdqa 64(%rdx), %xmm2 7216; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7217; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 7218; 
AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] 7219; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7220; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 7221; AVX2-FP-NEXT: vmovdqa 64(%r8), %xmm1 7222; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7223; AVX2-FP-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 7224; AVX2-FP-NEXT: vpbroadcastq %xmm1, %ymm1 7225; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 7226; AVX2-FP-NEXT: vmovdqa 64(%r9), %xmm1 7227; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7228; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] 7229; AVX2-FP-NEXT: vpbroadcastq %xmm1, %ymm1 7230; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 7231; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7232; AVX2-FP-NEXT: vmovdqa 96(%rsi), %xmm6 7233; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm0 7234; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7235; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] 7236; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7237; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 7238; AVX2-FP-NEXT: vmovdqa 96(%rcx), %xmm1 7239; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7240; AVX2-FP-NEXT: vmovdqa 96(%rdx), %xmm2 7241; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7242; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 7243; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] 7244; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7245; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 7246; AVX2-FP-NEXT: vmovdqa 96(%r8), %xmm1 7247; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7248; AVX2-FP-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 7249; AVX2-FP-NEXT: vpbroadcastq %xmm1, %ymm1 7250; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 7251; AVX2-FP-NEXT: vmovdqa 96(%r9), %xmm1 7252; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7253; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] 7254; AVX2-FP-NEXT: vpbroadcastq %xmm1, %ymm1 7255; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 7256; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7257; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm14 7258; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm0 7259; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7260; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11] 7261; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] 7262; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm13 7263; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm9 7264; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11] 7265; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7266; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7267; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] 7268; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 
= ymm1[2,1,2,3] 7269; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 7270; AVX2-FP-NEXT: vmovdqa (%r8), %ymm1 7271; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7272; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 7273; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 7274; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 7275; AVX2-FP-NEXT: vmovdqa (%r9), %ymm1 7276; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7277; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 7278; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 7279; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 7280; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7281; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 7282; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7283; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm0 7284; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7285; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 7286; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] 7287; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm15 7288; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm1 7289; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7290; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11] 7291; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] 7292; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 7293; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 7294; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm1 7295; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7296; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 7297; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 7298; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 7299; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm1 7300; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7301; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 7302; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 7303; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 7304; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7305; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm1 7306; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7307; AVX2-FP-NEXT: vmovdqa 64(%rsi), %ymm0 7308; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7309; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 7310; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] 7311; AVX2-FP-NEXT: vmovdqa 64(%rdx), %ymm8 7312; AVX2-FP-NEXT: vmovdqa 64(%rcx), %ymm1 7313; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7314; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] 7315; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill 7316; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] 7317; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 7318; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 7319; AVX2-FP-NEXT: vmovdqa 64(%r8), %ymm1 7320; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7321; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 7322; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 7323; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 7324; AVX2-FP-NEXT: vmovdqa 64(%r9), %ymm1 7325; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7326; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 7327; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 7328; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 7329; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7330; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm0 7331; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7332; AVX2-FP-NEXT: vmovdqa 96(%rsi), %ymm7 7333; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[8],ymm7[8],ymm0[9],ymm7[9],ymm0[10],ymm7[10],ymm0[11],ymm7[11] 7334; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7335; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] 7336; AVX2-FP-NEXT: vmovdqa 96(%rdx), %ymm2 7337; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7338; AVX2-FP-NEXT: vmovdqa 96(%rcx), %ymm1 7339; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7340; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 7341; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] 7342; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 7343; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 7344; AVX2-FP-NEXT: vmovdqa 96(%r8), %ymm1 7345; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7346; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 7347; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 7348; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 7349; AVX2-FP-NEXT: vmovdqa 96(%r9), %ymm1 7350; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7351; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 7352; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 7353; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 7354; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7355; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] 7356; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7357; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 7358; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7359; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 7360; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 7361; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7362; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7363; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7364; 
AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7365; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 7366; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7367; AVX2-FP-NEXT: vpbroadcastq %xmm2, %ymm2 7368; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 7369; AVX2-FP-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7370; AVX2-FP-NEXT: # xmm2 = mem[2,1,3,3,4,5,6,7] 7371; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 7372; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 7373; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] 7374; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7375; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 7376; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 7377; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 7378; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 7379; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7380; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm2 7381; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7382; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 7383; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 7384; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7385; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7386; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7387; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] 7388; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 7389; AVX2-FP-NEXT: vpbroadcastq %xmm3, %ymm3 7390; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 7391; AVX2-FP-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7392; AVX2-FP-NEXT: # xmm3 = mem[2,1,3,3,4,5,6,7] 7393; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 7394; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] 7395; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7396; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 7397; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 7398; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 7399; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7400; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm2 7401; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7402; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 7403; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 7404; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload 7405; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7406; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7407; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7408; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] 7409; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 7410; AVX2-FP-NEXT: vpbroadcastq %xmm3, 
%ymm3 7411; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 7412; AVX2-FP-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7413; AVX2-FP-NEXT: # xmm3 = mem[2,1,3,3,4,5,6,7] 7414; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 7415; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] 7416; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7417; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 7418; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 7419; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 7420; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7421; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm2 7422; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7423; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 7424; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 7425; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7426; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7427; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7428; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7429; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 7430; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 7431; AVX2-FP-NEXT: vpbroadcastq %xmm2, %ymm2 7432; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] 7433; AVX2-FP-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7434; AVX2-FP-NEXT: # xmm2 = mem[2,1,3,3,4,5,6,7] 7435; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 7436; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] 7437; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7438; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 7439; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 7440; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 7441; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7442; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] 7443; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7444; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 7445; AVX2-FP-NEXT: vmovdqa %ymm14, %ymm6 7446; AVX2-FP-NEXT: vpshufb %ymm0, %ymm14, %ymm2 7447; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 7448; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7449; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7450; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 7451; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 7452; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 7453; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 7454; AVX2-FP-NEXT: vpshuflw $246, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 7455; AVX2-FP-NEXT: # ymm2 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 7456; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 7457; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 7458; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] 7459; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,1] 7460; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7461; AVX2-FP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 7462; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 7463; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 7464; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7465; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7466; AVX2-FP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 7467; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7468; AVX2-FP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 7469; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 7470; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7471; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7472; AVX2-FP-NEXT: vmovdqa %ymm15, %ymm13 7473; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm5 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7474; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] 7475; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 7476; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 7477; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 7478; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 7479; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 7480; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 7481; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] 7482; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 7483; AVX2-FP-NEXT: vpshufb %ymm1, %ymm15, %ymm3 7484; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 7485; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 7486; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7487; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7488; AVX2-FP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 7489; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7490; AVX2-FP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 7491; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 7492; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 7493; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7494; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm5 = 
ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7495; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] 7496; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 7497; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 7498; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 7499; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 7500; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 7501; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 7502; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] 7503; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7504; AVX2-FP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 7505; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 7506; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 7507; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7508; AVX2-FP-NEXT: vpshufb %ymm0, %ymm7, %ymm2 7509; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 7510; AVX2-FP-NEXT: vpshufb %ymm0, %ymm10, %ymm0 7511; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 7512; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7513; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7514; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 7515; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7516; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 7517; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 7518; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 7519; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] 7520; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 7521; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 7522; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 7523; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] 7524; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 7525; AVX2-FP-NEXT: vpshufb %ymm1, %ymm9, %ymm1 7526; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 7527; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 7528; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7529; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7530; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 7531; AVX2-FP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 7532; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7533; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 7534; AVX2-FP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] 7535; 
AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] 7536; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] 7537; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 7538; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 7539; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 7540; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7541; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 7542; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 7543; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] 7544; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] 7545; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7546; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 7547; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,1] 7548; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 7549; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 7550; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7551; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7552; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload 7553; AVX2-FP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 7554; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7555; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload 7556; AVX2-FP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 7557; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] 7558; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] 7559; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] 7560; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 7561; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7562; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm4 7563; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 7564; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] 7565; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7566; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm4 7567; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 7568; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 7569; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7570; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7571; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload 7572; AVX2-FP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 7573; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7574; AVX2-FP-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload 7575; AVX2-FP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 7576; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] 7577; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] 7578; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] 7579; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 7580; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7581; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm4 7582; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 7583; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] 7584; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7585; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm4 7586; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 7587; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 7588; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7589; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7590; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload 7591; AVX2-FP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 7592; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7593; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload 7594; AVX2-FP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 7595; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] 7596; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] 7597; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] 7598; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 7599; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7600; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm1 7601; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7602; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] 7603; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7604; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm2 7605; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 7606; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm0 7607; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7608; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload 7609; AVX2-FP-NEXT: # ymm1 = ymm6[4],mem[4],ymm6[5],mem[5],ymm6[6],mem[6],ymm6[7],mem[7],ymm6[12],mem[12],ymm6[13],mem[13],ymm6[14],mem[14],ymm6[15],mem[15] 7610; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7611; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 7612; AVX2-FP-NEXT: # ymm2 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] 7613; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] 7614; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] 7615; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 7616; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 7617; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 7618; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7619; AVX2-FP-NEXT: vpshufb %ymm6, %ymm0, %ymm3 7620; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 7621; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] 7622; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] 7623; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7624; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm4 7625; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] 7626; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm4 7627; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7628; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 7629; 
AVX2-FP-NEXT: # ymm3 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] 7630; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload 7631; AVX2-FP-NEXT: # ymm0 = ymm13[4],mem[4],ymm13[5],mem[5],ymm13[6],mem[6],ymm13[7],mem[7],ymm13[12],mem[12],ymm13[13],mem[13],ymm13[14],mem[14],ymm13[15],mem[15] 7632; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] 7633; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] 7634; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] 7635; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] 7636; AVX2-FP-NEXT: vpshufb %ymm6, %ymm14, %ymm3 7637; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 7638; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] 7639; AVX2-FP-NEXT: vpshufb %ymm2, %ymm15, %ymm3 7640; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 7641; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm3 7642; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7643; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 7644; AVX2-FP-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] 7645; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7646; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm12[4],ymm1[5],ymm12[5],ymm1[6],ymm12[6],ymm1[7],ymm12[7],ymm1[12],ymm12[12],ymm1[13],ymm12[13],ymm1[14],ymm12[14],ymm1[15],ymm12[15] 7647; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] 7648; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] 7649; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 7650; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 7651; AVX2-FP-NEXT: vpshufb %ymm6, %ymm11, %ymm1 7652; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 7653; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 7654; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7655; AVX2-FP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 7656; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 7657; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 7658; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload 7659; AVX2-FP-NEXT: # ymm1 = ymm10[4],mem[4],ymm10[5],mem[5],ymm10[6],mem[6],ymm10[7],mem[7],ymm10[12],mem[12],ymm10[13],mem[13],ymm10[14],mem[14],ymm10[15],mem[15] 7660; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload 7661; AVX2-FP-NEXT: # ymm11 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15] 7662; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] 7663; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] 7664; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] 7665; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7] 7666; AVX2-FP-NEXT: vpshufb %ymm6, %ymm8, %ymm6 7667; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] 7668; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] 7669; AVX2-FP-NEXT: vpshufb %ymm2, %ymm9, %ymm2 7670; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 7671; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 7672; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), 
%rax 7673; AVX2-FP-NEXT: vmovdqa %ymm1, 736(%rax) 7674; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7675; AVX2-FP-NEXT: vmovaps %ymm1, 704(%rax) 7676; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7677; AVX2-FP-NEXT: vmovaps %ymm1, 672(%rax) 7678; AVX2-FP-NEXT: vmovdqa %ymm0, 544(%rax) 7679; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7680; AVX2-FP-NEXT: vmovaps %ymm0, 512(%rax) 7681; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7682; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rax) 7683; AVX2-FP-NEXT: vmovdqa %ymm3, 352(%rax) 7684; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7685; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rax) 7686; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7687; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rax) 7688; AVX2-FP-NEXT: vmovdqa %ymm4, 160(%rax) 7689; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7690; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax) 7691; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7692; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax) 7693; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7694; AVX2-FP-NEXT: vmovaps %ymm0, 640(%rax) 7695; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7696; AVX2-FP-NEXT: vmovaps %ymm0, 608(%rax) 7697; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7698; AVX2-FP-NEXT: vmovaps %ymm0, 576(%rax) 7699; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7700; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rax) 7701; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7702; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rax) 7703; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7704; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rax) 7705; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7706; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rax) 7707; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7708; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) 7709; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7710; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rax) 7711; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7712; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax) 7713; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7714; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) 7715; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7716; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) 7717; AVX2-FP-NEXT: addq $1544, %rsp # imm = 0x608 7718; AVX2-FP-NEXT: vzeroupper 7719; AVX2-FP-NEXT: retq 7720; 7721; AVX2-FCP-LABEL: store_i16_stride6_vf64: 7722; AVX2-FCP: # %bb.0: 7723; AVX2-FCP-NEXT: subq $1560, %rsp # imm = 0x618 7724; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 7725; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7726; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm5 7727; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7728; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] 7729; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 7730; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 7731; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7732; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm7 7733; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7734; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 7735; AVX2-FCP-NEXT: vpunpckhwd 
{{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 7736; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7737; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm2 7738; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7739; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm8 7740; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7741; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7742; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm3 7743; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7744; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm14 7745; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7746; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 7747; AVX2-FCP-NEXT: vpbroadcastq %xmm2, %ymm2 7748; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 7749; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 7750; AVX2-FCP-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill 7751; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] 7752; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 7753; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 7754; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm1 7755; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7756; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] 7757; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 7758; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,2,1] 7759; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 7760; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 7761; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7762; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm3 7763; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm4 7764; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 7765; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm4 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7766; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm5 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7767; AVX2-FCP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7768; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 7769; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] 7770; AVX2-FCP-NEXT: vpbroadcastq %xmm4, %ymm4 7771; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 7772; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm4 7773; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7774; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] 7775; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] 7776; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] 7777; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm4 7778; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7779; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm4 7780; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] 7781; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 7782; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7783; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 7784; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
7785; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 7786; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7787; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 7788; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 7789; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 7790; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %xmm4 7791; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7792; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 7793; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7794; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7795; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7796; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 7797; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] 7798; AVX2-FCP-NEXT: vpbroadcastq %xmm4, %ymm4 7799; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 7800; AVX2-FCP-NEXT: vmovdqa 64(%r8), %xmm4 7801; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7802; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] 7803; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] 7804; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] 7805; AVX2-FCP-NEXT: vmovdqa 64(%r9), %xmm4 7806; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7807; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm4 7808; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] 7809; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 7810; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7811; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm3 7812; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7813; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 7814; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm4 7815; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7816; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0 7817; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 7818; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm13 7819; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7820; AVX2-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7821; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %xmm12 7822; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm4 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 7823; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7824; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 7825; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 7826; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3 7827; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] 7828; AVX2-FCP-NEXT: vmovdqa 96(%r8), %xmm3 7829; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7830; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] 7831; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 7832; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] 7833; AVX2-FCP-NEXT: vmovdqa 96(%r9), %xmm3 7834; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7835; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2 7836; 
AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 7837; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 7838; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7839; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 7840; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7841; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2 7842; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7843; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] 7844; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 7845; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 7846; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 7847; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm4 7848; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7849; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm3 7850; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7851; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7852; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7853; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] 7854; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 7855; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 7856; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 7857; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm3 7858; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7859; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 7860; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 7861; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] 7862; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm4 7863; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7864; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] 7865; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] 7866; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 7867; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] 7868; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 7869; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7870; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 7871; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7872; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 7873; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7874; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 7875; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm4 7876; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] 7877; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm10 7878; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm11 7879; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7880; AVX2-FCP-NEXT: vmovdqu %ymm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7881; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm5 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7882; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7883; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] 7884; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 7885; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] 7886; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 7887; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm4 7888; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7889; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 7890; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] 7891; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] 7892; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm4 7893; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7894; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 7895; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] 7896; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 7897; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7898; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 7899; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7900; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm9 7901; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm3 7902; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm4 7903; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] 7904; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %ymm15 7905; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %ymm4 7906; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7907; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7908; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm5 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7909; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7910; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] 7911; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 7912; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] 7913; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 7914; AVX2-FCP-NEXT: vmovdqa 64(%r8), %ymm4 7915; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7916; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 7917; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] 7918; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] 7919; AVX2-FCP-NEXT: vmovdqa 64(%r9), %ymm4 7920; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7921; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 7922; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] 7923; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 7924; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill 7925; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 7926; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7927; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm3 7928; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7929; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 7930; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 7931; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] 7932; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm8 7933; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm7 7934; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7935; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7936; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 7937; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7938; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] 7939; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 7940; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 7941; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] 7942; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm6 7943; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 7944; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7945; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 7946; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] 7947; AVX2-FCP-NEXT: vmovdqa 96(%r9), %ymm3 7948; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7949; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 7950; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 7951; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 7952; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7953; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 7954; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7955; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 7956; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,0,2,2,1,0,2,2] 7957; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] 7958; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 7959; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7960; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 7961; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 7962; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 7963; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] 7964; AVX2-FCP-NEXT: vpmovzxwd (%rsp), %xmm2 # 16-byte Folded Reload 7965; AVX2-FCP-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 7966; AVX2-FCP-NEXT: vpbroadcastq %xmm2, %ymm2 7967; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] 7968; AVX2-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7969; AVX2-FCP-NEXT: # xmm0 = mem[0,0,2,1,4,5,6,7] 7970; AVX2-FCP-NEXT: 
vpbroadcastq %xmm0, %ymm3 7971; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 7972; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 7973; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7974; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload 7975; AVX2-FCP-NEXT: # xmm2 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] 7976; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 7977; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7978; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload 7979; AVX2-FCP-NEXT: # xmm3 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] 7980; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 7981; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 7982; AVX2-FCP-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7983; AVX2-FCP-NEXT: # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 7984; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3 7985; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 7986; AVX2-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7987; AVX2-FCP-NEXT: # xmm3 = mem[0,0,2,1,4,5,6,7] 7988; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3 7989; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 7990; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7991; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7992; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 7993; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 7994; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 7995; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7996; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 7997; AVX2-FCP-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 7998; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 7999; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 8000; AVX2-FCP-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 8001; AVX2-FCP-NEXT: # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 8002; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3 8003; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 8004; AVX2-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 8005; AVX2-FCP-NEXT: # xmm3 = mem[0,0,2,1,4,5,6,7] 8006; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3 8007; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 8008; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8009; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] 8010; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 8011; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8012; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 8013; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 8014; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 8015; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 8016; AVX2-FCP-NEXT: 
vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8017; AVX2-FCP-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 8018; AVX2-FCP-NEXT: vpbroadcastq %xmm2, %ymm2 8019; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 8020; AVX2-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8021; AVX2-FCP-NEXT: # xmm2 = mem[0,0,2,1,4,5,6,7] 8022; AVX2-FCP-NEXT: vpbroadcastq %xmm2, %ymm2 8023; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 8024; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8025; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8026; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload 8027; AVX2-FCP-NEXT: # ymm2 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] 8028; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,4,2,2,5,4,6,6] 8029; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 8030; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 8031; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 8032; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] 8033; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 8034; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 8035; AVX2-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 8036; AVX2-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 8037; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 8038; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 8039; AVX2-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 8040; AVX2-FCP-NEXT: # ymm3 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 8041; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 8042; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 8043; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8044; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] 8045; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 8046; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 8047; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 8048; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] 8049; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 8050; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 8051; AVX2-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 8052; AVX2-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 8053; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 8054; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 8055; AVX2-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 8056; AVX2-FCP-NEXT: # ymm3 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 8057; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 8058; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 8059; AVX2-FCP-NEXT: vmovdqu 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8060; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 8061; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] 8062; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 8063; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm12 8064; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 8065; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] 8066; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 8067; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 8068; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 8069; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 8070; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 8071; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 8072; AVX2-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 8073; AVX2-FCP-NEXT: # ymm3 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 8074; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 8075; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 8076; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8077; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] 8078; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 8079; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 8080; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 8081; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] 8082; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 8083; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 8084; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 8085; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 8086; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 8087; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 8088; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 8089; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 8090; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 8091; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8092; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 8093; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8094; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 8095; AVX2-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] 8096; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,2,1,2,0,0,3,3] 8097; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 8098; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] 8099; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 8100; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = 
[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 8101; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 8102; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 8103; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 8104; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 8105; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] 8106; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8107; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 8108; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,1,0,1] 8109; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 8110; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 8111; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8112; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8113; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload 8114; AVX2-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 8115; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload 8116; AVX2-FCP-NEXT: # xmm5 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] 8117; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm4 8118; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] 8119; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 8120; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8121; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm5 8122; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 8123; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] 8124; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8125; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm5 8126; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 8127; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 8128; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8129; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8130; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload 8131; AVX2-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 8132; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8133; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload 8134; AVX2-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 8135; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm4 8136; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] 8137; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 8138; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8139; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm5 8140; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 8141; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] 8142; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8143; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm5 8144; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 8145; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 8146; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8147; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8148; AVX2-FCP-NEXT: vpunpckhwd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload 8149; AVX2-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 8150; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm3 8151; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8152; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload 8153; AVX2-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 8154; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] 8155; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 8156; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8157; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm2 8158; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 8159; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 8160; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8161; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm1 8162; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 8163; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm10 8164; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8165; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 8166; AVX2-FCP-NEXT: # ymm1 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] 8167; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8168; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 8169; AVX2-FCP-NEXT: # ymm2 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] 8170; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [5,6,5,6,5,6,7,7] 8171; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 8172; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] 8173; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 8174; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 8175; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8176; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm4 8177; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] 8178; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] 8179; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] 8180; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8181; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm14 8182; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 8183; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm14, %ymm5 8184; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8185; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload 8186; AVX2-FCP-NEXT: # ymm14 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] 8187; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8188; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8189; AVX2-FCP-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] 8190; AVX2-FCP-NEXT: 
vpermd %ymm14, %ymm3, %ymm14 8191; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] 8192; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] 8193; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 8194; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm14 8195; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 8196; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6],ymm14[7] 8197; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 8198; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm14 8199; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 8200; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm14, %ymm4 8201; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8202; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[12],ymm13[12],ymm0[13],ymm13[13],ymm0[14],ymm13[14],ymm0[15],ymm13[15] 8203; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] 8204; AVX2-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm14 8205; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] 8206; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] 8207; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm14 8208; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 8209; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6],ymm14[7] 8210; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 8211; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm14 8212; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 8213; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm14, %ymm0 8214; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 8215; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload 8216; AVX2-FCP-NEXT: # ymm14 = ymm11[4],mem[4],ymm11[5],mem[5],ymm11[6],mem[6],ymm11[7],mem[7],ymm11[12],mem[12],ymm11[13],mem[13],ymm11[14],mem[14],ymm11[15],mem[15] 8217; AVX2-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm3 8218; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] 8219; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[3,3,3,3] 8220; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7] 8221; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 8222; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm2 8223; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] 8224; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 8225; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1 8226; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 8227; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 8228; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8229; AVX2-FCP-NEXT: vmovdqa %ymm1, 736(%rax) 8230; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8231; AVX2-FCP-NEXT: vmovaps %ymm1, 672(%rax) 8232; AVX2-FCP-NEXT: vmovdqa %ymm0, 544(%rax) 8233; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8234; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rax) 8235; AVX2-FCP-NEXT: vmovdqa %ymm4, 352(%rax) 8236; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8237; 
AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rax) 8238; AVX2-FCP-NEXT: vmovdqa %ymm5, 160(%rax) 8239; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8240; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax) 8241; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8242; AVX2-FCP-NEXT: vmovaps %ymm0, 704(%rax) 8243; AVX2-FCP-NEXT: vmovdqa %ymm10, 640(%rax) 8244; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8245; AVX2-FCP-NEXT: vmovaps %ymm0, 576(%rax) 8246; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8247; AVX2-FCP-NEXT: vmovaps %ymm0, 512(%rax) 8248; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8249; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rax) 8250; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8251; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rax) 8252; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8253; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rax) 8254; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8255; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rax) 8256; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8257; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax) 8258; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8259; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax) 8260; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8261; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax) 8262; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8263; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) 8264; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8265; AVX2-FCP-NEXT: vmovaps %ymm0, 608(%rax) 8266; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8267; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rax) 8268; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8269; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax) 8270; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8271; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) 8272; AVX2-FCP-NEXT: addq $1560, %rsp # imm = 0x618 8273; AVX2-FCP-NEXT: vzeroupper 8274; AVX2-FCP-NEXT: retq 8275; 8276; AVX512-LABEL: store_i16_stride6_vf64: 8277; AVX512: # %bb.0: 8278; AVX512-NEXT: subq $392, %rsp # imm = 0x188 8279; AVX512-NEXT: vmovdqa 96(%rcx), %ymm9 8280; AVX512-NEXT: vpsrldq {{.*#+}} ymm0 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8281; AVX512-NEXT: vmovdqa 96(%rdx), %ymm4 8282; AVX512-NEXT: vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8283; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 8284; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 8285; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm9[4],ymm4[5],ymm9[5],ymm4[6],ymm9[6],ymm4[7],ymm9[7],ymm4[12],ymm9[12],ymm4[13],ymm9[13],ymm4[14],ymm9[14],ymm4[15],ymm9[15] 8286; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] 8287; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 8288; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 8289; AVX512-NEXT: vmovdqa 96(%rsi), %ymm10 8290; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,1,2,3,6,5,6,7] 8291; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = 
ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 8292; AVX512-NEXT: vmovdqa 96(%rdi), %ymm6 8293; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,1,2,3,6,5,6,7] 8294; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 8295; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] 8296; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 8297; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[12],ymm10[12],ymm6[13],ymm10[13],ymm6[14],ymm10[14],ymm6[15],ymm10[15] 8298; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] 8299; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 8300; AVX512-NEXT: movw $18724, %ax # imm = 0x4924 8301; AVX512-NEXT: kmovw %eax, %k1 8302; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 8303; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 8304; AVX512-NEXT: vmovdqa 96(%r8), %ymm3 8305; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 8306; AVX512-NEXT: vpshufb %ymm14, %ymm3, %ymm5 8307; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] 8308; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] 8309; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 8310; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] 8311; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] 8312; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8313; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8314; AVX512-NEXT: vmovdqa 64(%rcx), %ymm5 8315; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8316; AVX512-NEXT: vpsrldq {{.*#+}} ymm0 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8317; AVX512-NEXT: vmovdqa 64(%rdx), %ymm7 8318; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8319; AVX512-NEXT: vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8320; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 8321; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 8322; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15] 8323; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] 8324; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 8325; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 8326; AVX512-NEXT: vmovdqa 64(%rsi), %ymm7 8327; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7] 8328; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 8329; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2 8330; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8331; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,1,2,3,6,5,6,7] 8332; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 8333; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] 8334; AVX512-NEXT: vpermq 
{{.*#+}} ymm1 = ymm1[2,1,2,3] 8335; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15] 8336; AVX512-NEXT: vmovdqa64 %ymm7, %ymm20 8337; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] 8338; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 8339; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 8340; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 8341; AVX512-NEXT: vmovdqa 64(%r8), %ymm12 8342; AVX512-NEXT: vpshufb %ymm14, %ymm12, %ymm5 8343; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] 8344; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] 8345; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 8346; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] 8347; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] 8348; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8349; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8350; AVX512-NEXT: vmovdqa 32(%rcx), %ymm5 8351; AVX512-NEXT: vpsrldq {{.*#+}} ymm0 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8352; AVX512-NEXT: vmovdqa 32(%rdx), %ymm7 8353; AVX512-NEXT: vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8354; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 8355; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 8356; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15] 8357; AVX512-NEXT: vmovdqa64 %ymm7, %ymm29 8358; AVX512-NEXT: vmovdqa64 %ymm5, %ymm18 8359; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] 8360; AVX512-NEXT: vmovdqa 32(%rsi), %ymm8 8361; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,1,2,3,6,5,6,7] 8362; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 8363; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 8364; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7] 8365; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 8366; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] 8367; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 8368; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 8369; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,1,2,3] 8370; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm8[4],ymm2[5],ymm8[5],ymm2[6],ymm8[6],ymm2[7],ymm8[7],ymm2[12],ymm8[12],ymm2[13],ymm8[13],ymm2[14],ymm8[14],ymm2[15],ymm8[15] 8371; AVX512-NEXT: vmovdqa64 %ymm2, %ymm17 8372; AVX512-NEXT: vmovdqa64 %ymm8, %ymm16 8373; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] 8374; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 8375; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 8376; AVX512-NEXT: vmovdqa 32(%r8), %ymm15 8377; AVX512-NEXT: vpshufb %ymm14, %ymm15, %ymm0 8378; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 8379; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm5 8380; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7] 8381; 
AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 8382; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] 8383; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] 8384; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8385; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8386; AVX512-NEXT: vmovdqa (%rcx), %ymm8 8387; AVX512-NEXT: vpsrldq {{.*#+}} ymm0 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8388; AVX512-NEXT: vmovdqa (%rdx), %ymm13 8389; AVX512-NEXT: vpsrldq {{.*#+}} ymm1 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8390; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 8391; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,2,2,2] 8392; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm8[4],ymm13[5],ymm8[5],ymm13[6],ymm8[6],ymm13[7],ymm8[7],ymm13[12],ymm8[12],ymm13[13],ymm8[13],ymm13[14],ymm8[14],ymm13[15],ymm8[15] 8393; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,3,3,5,6,7,7] 8394; AVX512-NEXT: vmovdqa (%rsi), %ymm2 8395; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,1,2,3,6,5,6,7] 8396; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 8397; AVX512-NEXT: vmovdqa (%rdi), %ymm5 8398; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[2,1,2,3,6,5,6,7] 8399; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 8400; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11] 8401; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 8402; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] 8403; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3] 8404; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] 8405; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm19, %zmm7 8406; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 8407; AVX512-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} 8408; AVX512-NEXT: vmovdqa (%r8), %ymm7 8409; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm11 8410; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] 8411; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm14 8412; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7] 8413; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 8414; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] 8415; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1,2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7] 8416; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 8417; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8418; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm8[0],ymm13[1],ymm8[1],ymm13[2],ymm8[2],ymm13[3],ymm8[3],ymm13[8],ymm8[8],ymm13[9],ymm8[9],ymm13[10],ymm8[10],ymm13[11],ymm8[11] 8419; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm23 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] 8420; AVX512-NEXT: vmovdqa (%rcx), %xmm13 8421; AVX512-NEXT: vmovdqa (%rdx), %xmm14 8422; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 8423; AVX512-NEXT: 
vpermt2d %zmm8, %zmm23, %zmm0 8424; AVX512-NEXT: vmovdqa (%rsi), %xmm1 8425; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8426; AVX512-NEXT: vmovdqa (%rdi), %xmm8 8427; AVX512-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8428; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] 8429; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] 8430; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] 8431; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] 8432; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 8433; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} 8434; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 8435; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 8436; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm5 8437; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] 8438; AVX512-NEXT: vmovdqa (%r8), %xmm8 8439; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 8440; AVX512-NEXT: vpshufb %xmm11, %xmm8, %xmm5 8441; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 8442; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] 8443; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 8444; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8445; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11] 8446; AVX512-NEXT: vmovdqa 96(%rcx), %xmm4 8447; AVX512-NEXT: vmovdqa 96(%rdx), %xmm5 8448; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 8449; AVX512-NEXT: vpermt2d %zmm0, %zmm23, %zmm7 8450; AVX512-NEXT: vmovdqa 96(%rsi), %xmm0 8451; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2 8452; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 8453; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] 8454; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11] 8455; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] 8456; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 8457; AVX512-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1} 8458; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 8459; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 8460; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm3 8461; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] 8462; AVX512-NEXT: vmovdqa 96(%r8), %xmm1 8463; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm6 8464; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 8465; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] 8466; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 8467; AVX512-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill 8468; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 8469; AVX512-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 8470; AVX512-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 8471; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 8472; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm31 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] 8473; AVX512-NEXT: vpermt2d %zmm4, %zmm31, %zmm3 8474; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] 8475; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] 8476; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,1,2,1] 8477; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] 8478; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 8479; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 8480; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 8481; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[0,1,0,1] 8482; AVX512-NEXT: movw $9362, %ax # imm = 0x2492 8483; AVX512-NEXT: kmovw %eax, %k2 8484; AVX512-NEXT: vmovdqa32 %zmm3, %zmm0 {%k2} 8485; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,1,3,3,4,5,6,7] 8486; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 8487; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm3 8488; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] 8489; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 8490; AVX512-NEXT: vpbroadcastq %xmm1, %ymm1 8491; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 8492; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm22 8493; AVX512-NEXT: vmovdqa 64(%rcx), %xmm3 8494; AVX512-NEXT: vmovdqa 64(%rdx), %xmm2 8495; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 8496; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 8497; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 8498; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 8499; AVX512-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 8500; AVX512-NEXT: vmovdqa 64(%rsi), %xmm6 8501; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[0,1,2,1] 8502; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,5] 8503; AVX512-NEXT: vmovdqa 64(%rdi), %xmm0 8504; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,2,1] 8505; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] 8506; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 8507; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] 8508; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] 8509; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[0,1,0,1] 8510; AVX512-NEXT: vmovdqa32 %zmm1, %zmm4 {%k2} 8511; AVX512-NEXT: vmovdqa 64(%r8), %xmm1 8512; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] 8513; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] 8514; AVX512-NEXT: vextracti64x4 $1, %zmm4, %ymm7 8515; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] 8516; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 8517; AVX512-NEXT: vpbroadcastq %xmm7, %ymm7 8518; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] 8519; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm28 8520; AVX512-NEXT: vmovdqa 96(%r9), %ymm7 8521; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 8522; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 
= ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 8523; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm5[2,2,2,3] 8524; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] 8525; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 8526; AVX512-NEXT: vpermq {{.*#+}} ymm25 = ymm5[2,1,2,3] 8527; AVX512-NEXT: vmovdqa 64(%r9), %ymm4 8528; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 8529; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 8530; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm5[2,2,2,3] 8531; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[2,3,2,3,6,7,6,7] 8532; AVX512-NEXT: vmovdqa64 %ymm4, %ymm21 8533; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 8534; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm5[2,1,2,3] 8535; AVX512-NEXT: vmovdqa 32(%r9), %ymm5 8536; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 8537; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 8538; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm9[2,2,2,3] 8539; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 8540; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 8541; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 8542; AVX512-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] 8543; AVX512-NEXT: vpermt2d %zmm2, %zmm23, %zmm3 8544; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 8545; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] 8546; AVX512-NEXT: vmovdqa64 %ymm20, %ymm6 8547; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 8548; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] 8549; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 8550; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 8551; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 8552; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 8553; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 8554; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm2 8555; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] 8556; AVX512-NEXT: vmovdqa %xmm11, %xmm4 8557; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm1 8558; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 8559; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] 8560; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm20 8561; AVX512-NEXT: vmovdqa 32(%rcx), %xmm0 8562; AVX512-NEXT: vmovdqa 32(%rdx), %xmm1 8563; AVX512-NEXT: vmovdqa64 %ymm29, %ymm2 8564; AVX512-NEXT: vmovdqa64 %ymm18, %ymm3 8565; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] 8566; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 8567; AVX512-NEXT: vpermt2d %zmm2, %zmm23, %zmm9 8568; AVX512-NEXT: vmovdqa 32(%rsi), %xmm2 8569; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3 8570; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 8571; AVX512-NEXT: vpermq {{.*#+}} ymm10 = 
ymm10[1,1,1,1] 8572; AVX512-NEXT: vmovdqa64 %ymm16, %ymm6 8573; AVX512-NEXT: vmovdqa64 %ymm17, %ymm11 8574; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[8],ymm6[8],ymm11[9],ymm6[9],ymm11[10],ymm6[10],ymm11[11],ymm6[11] 8575; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] 8576; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm11 8577; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[2,3,2,3,6,7,6,7] 8578; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 8579; AVX512-NEXT: vpermq {{.*#+}} ymm17 = ymm10[2,1,2,3] 8580; AVX512-NEXT: vmovdqa32 %zmm11, %zmm9 {%k1} 8581; AVX512-NEXT: vextracti64x4 $1, %zmm9, %ymm11 8582; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 8583; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] 8584; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] 8585; AVX512-NEXT: vmovdqa 32(%r8), %xmm15 8586; AVX512-NEXT: vpshufb %xmm4, %xmm15, %xmm12 8587; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] 8588; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6],ymm12[7] 8589; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm19 8590; AVX512-NEXT: vmovdqa (%r9), %ymm11 8591; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 8592; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 8593; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 8594; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 8595; AVX512-NEXT: vpermt2d %zmm0, %zmm31, %zmm12 8596; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,1,2,1] 8597; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] 8598; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,1,2,1] 8599; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] 8600; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 8601; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 8602; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 8603; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm0[2,2,2,3] 8604; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 8605; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,3,2,3,6,7,6,7] 8606; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 8607; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm2[2,1,2,3] 8608; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 8609; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1] 8610; AVX512-NEXT: vmovdqa32 %zmm12, %zmm1 {%k2} 8611; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm3 8612; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[2,1,3,3,4,5,6,7] 8613; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] 8614; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1,2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7] 8615; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero 8616; AVX512-NEXT: vpbroadcastq %xmm12, %ymm12 8617; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7] 8618; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm15 8619; AVX512-NEXT: vmovdqa (%r9), %xmm3 8620; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] 
8621; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7] 8622; AVX512-NEXT: vpermq {{.*#+}} ymm23 = ymm1[0,1,0,1] 8623; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 8624; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm1[2,2,2,2] 8625; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] 8626; AVX512-NEXT: vpsrldq {{.*#+}} xmm11 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 8627; AVX512-NEXT: vpsrldq {{.*#+}} xmm12 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 8628; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] 8629; AVX512-NEXT: vpermt2d %zmm11, %zmm31, %zmm1 8630; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8631; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,1,2,1] 8632; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] 8633; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8634; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] 8635; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] 8636; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 8637; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 8638; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] 8639; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[0,1,0,1] 8640; AVX512-NEXT: vmovdqa32 %zmm1, %zmm11 {%k2} 8641; AVX512-NEXT: vextracti64x4 $1, %zmm11, %ymm1 8642; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,1,3,3,4,5,6,7] 8643; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] 8644; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1,2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7] 8645; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero 8646; AVX512-NEXT: vpbroadcastq %xmm8, %ymm8 8647; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] 8648; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 8649; AVX512-NEXT: vmovdqa 96(%r9), %xmm8 8650; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,3,2,3] 8651; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,1,4,5,6,7] 8652; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] 8653; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 8654; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] 8655; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[0,0,2,1,4,5,6,7] 8656; AVX512-NEXT: vpbroadcastq %xmm12, %ymm12 8657; AVX512-NEXT: vmovdqa 64(%r9), %xmm13 8658; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] 8659; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] 8660; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] 8661; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7] 8662; AVX512-NEXT: vpbroadcastq %xmm14, %ymm14 8663; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[0,2,2,3,4,5,6,7] 8664; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] 8665; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] 8666; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm24 8667; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 8668; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm24 # 64-byte Folded Reload 8669; 
AVX512-NEXT: # zmm24 = zmm24 ^ (zmm25 & (zmm24 ^ mem)) 8670; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm26, %zmm26 8671; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] 8672; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] 8673; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] 8674; AVX512-NEXT: vmovdqa 32(%r9), %xmm0 8675; AVX512-NEXT: vmovdqa64 %ymm21, %ymm2 8676; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 8677; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] 8678; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] 8679; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7] 8680; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] 8681; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm26 # 64-byte Folded Reload 8682; AVX512-NEXT: # zmm26 = zmm26 ^ (zmm25 & (zmm26 ^ mem)) 8683; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 8684; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] 8685; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7] 8686; AVX512-NEXT: vpbroadcastq %xmm9, %ymm9 8687; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm30, %zmm17 8688; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 8689; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 8690; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 8691; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] 8692; AVX512-NEXT: vpbroadcastq %xmm2, %ymm2 8693; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] 8694; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 8695; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 8696; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm17 # 64-byte Folded Reload 8697; AVX512-NEXT: # zmm17 = zmm17 ^ (zmm25 & (zmm17 ^ mem)) 8698; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm18, %zmm16 8699; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm16 # 64-byte Folded Reload 8700; AVX512-NEXT: # zmm16 = zmm16 ^ (zmm25 & (zmm16 ^ mem)) 8701; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm23, %zmm18 8702; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 8703; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm18 # 64-byte Folded Reload 8704; AVX512-NEXT: # zmm18 = zmm18 ^ (zmm23 & (zmm18 ^ mem)) 8705; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4 8706; AVX512-NEXT: vpternlogd $184, (%rsp), %zmm23, %zmm4 # 64-byte Folded Reload 8707; AVX512-NEXT: # zmm4 = zmm4 ^ (zmm23 & (zmm4 ^ mem)) 8708; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 8709; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm23 & (zmm7 ^ zmm20)) 8710; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 8711; AVX512-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (zmm23 & (zmm5 ^ zmm19)) 8712; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm8 8713; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 8714; AVX512-NEXT: vpternlogd {{.*#+}} zmm8 = zmm8 ^ (zmm10 & (zmm8 ^ zmm22)) 8715; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm6 8716; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (zmm10 & (zmm6 ^ zmm28)) 8717; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 8718; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 
= zmm0 ^ (zmm10 & (zmm0 ^ zmm15)) 8719; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 8720; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm1)) 8721; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 8722; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) 8723; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) 8724; AVX512-NEXT: vmovdqa64 %zmm5, 256(%rax) 8725; AVX512-NEXT: vmovdqa64 %zmm7, 448(%rax) 8726; AVX512-NEXT: vmovdqa64 %zmm6, 384(%rax) 8727; AVX512-NEXT: vmovdqa64 %zmm8, 576(%rax) 8728; AVX512-NEXT: vmovdqa64 %zmm4, 640(%rax) 8729; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rax) 8730; AVX512-NEXT: vmovdqa64 %zmm16, 128(%rax) 8731; AVX512-NEXT: vmovdqa64 %zmm17, 320(%rax) 8732; AVX512-NEXT: vmovdqa64 %zmm26, 512(%rax) 8733; AVX512-NEXT: vmovdqa64 %zmm24, 704(%rax) 8734; AVX512-NEXT: addq $392, %rsp # imm = 0x188 8735; AVX512-NEXT: vzeroupper 8736; AVX512-NEXT: retq 8737; 8738; AVX512-FCP-LABEL: store_i16_stride6_vf64: 8739; AVX512-FCP: # %bb.0: 8740; AVX512-FCP-NEXT: subq $1240, %rsp # imm = 0x4D8 8741; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm1 8742; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm2 8743; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm6 8744; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm5 8745; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm9 8746; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm10 8747; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm7 8748; AVX512-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8749; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm0 8750; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8751; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm13 8752; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8 8753; AVX512-FCP-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill 8754; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm12 8755; AVX512-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8756; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm11 8757; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm4 8758; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8759; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm3 8760; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8761; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] 8762; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] 8763; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 8764; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8765; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] 8766; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 8767; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 8768; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8769; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] 8770; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] 8771; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm18 8772; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 8773; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8774; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm3 8775; 
AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm7 8776; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] 8777; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] 8778; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 8779; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8780; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm12 8781; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8782; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] 8783; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm4 8784; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm0 8785; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8786; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm8 8787; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] 8788; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [2,1,2,3,11,11,11,11] 8789; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm12[4],ymm0[5],ymm12[5],ymm0[6],ymm12[6],ymm0[7],ymm12[7],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15] 8790; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 8791; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7] 8792; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] 8793; AVX512-FCP-NEXT: vpermd %ymm12, %ymm4, %ymm12 8794; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8795; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8796; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 8797; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 8798; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 8799; AVX512-FCP-NEXT: movw $18724, %ax # imm = 0x4924 8800; AVX512-FCP-NEXT: kmovw %eax, %k1 8801; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} 8802; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [12,1,2,13,4,5,14,7] 8803; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm0 8804; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm2 8805; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8806; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 8807; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm25, %ymm0 8808; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [8,21,10,11,20,13,14,23] 8809; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 8810; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm1 8811; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm8 8812; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 8813; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8814; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [2,2,0,3,10,0,10,11] 8815; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm1 8816; 
AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8817; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] 8818; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm0 8819; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] 8820; AVX512-FCP-NEXT: # ymm14 = mem[0,1,0,1] 8821; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1 8822; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 8823; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8824; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm2 8825; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8826; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm0 8827; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 8828; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8829; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm8, %ymm1 8830; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 8831; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] 8832; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 8833; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] 8834; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1 8835; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8836; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8837; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] 8838; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] 8839; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 8840; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} 8841; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 8842; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm2 8843; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8844; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 8845; AVX512-FCP-NEXT: vpermt2d %ymm5, %ymm25, %ymm1 8846; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm5 8847; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm0 8848; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8849; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8850; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm1 8851; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8852; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm0 8853; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1 8854; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 8855; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8856; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 8857; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8858; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] 8859; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm0 8860; 
AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 8861; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8862; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm1 8863; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 8864; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,2,1,8,9,8,9] 8865; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 8866; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 8867; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [1,0,2,2,1,0,2,2] 8868; AVX512-FCP-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] 8869; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] 8870; AVX512-FCP-NEXT: vpermd %ymm0, %ymm30, %ymm0 8871; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 8872; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm5 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 8873; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 8874; AVX512-FCP-NEXT: vpbroadcastq %xmm2, %ymm2 8875; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 8876; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492 8877; AVX512-FCP-NEXT: kmovw %eax, %k2 8878; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} 8879; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm0 8880; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [16,9,10,17,12,13,18,15] 8881; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm5 8882; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8883; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7] 8884; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm1 8885; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero 8886; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,1,8,3,4,9,6,7] 8887; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm26, %ymm0 8888; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 8889; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8890; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,8,8,0,9] 8891; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm1 8892; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8893; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] 8894; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm0 8895; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] 8896; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 8897; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8898; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 8899; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm0 8900; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 8901; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm1 8902; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 8903; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 8904; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm23 8905; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 8906; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 8907; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm2 8908; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] 8909; AVX512-FCP-NEXT: vpermd %ymm0, %ymm30, %ymm0 8910; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = 
xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 8911; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 8912; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 8913; AVX512-FCP-NEXT: vpbroadcastq %xmm2, %ymm2 8914; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 8915; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} 8916; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm0 8917; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm5 8918; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7] 8919; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm1 8920; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero 8921; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 8922; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm26, %ymm0 8923; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 8924; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8925; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm1 8926; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8927; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm0 8928; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] 8929; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 8930; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8931; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 8932; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm0 8933; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 8934; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm1 8935; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 8936; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] 8937; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm21 8938; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm20 8939; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 8940; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] 8941; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1 8942; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8943; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8944; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 8945; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 8946; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 8947; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} 8948; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 8949; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm3 8950; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 8951; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm25, %ymm1 8952; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm5 8953; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm2 8954; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm18 8955; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm0 8956; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8957; AVX512-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8958; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm12 8959; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm0 8960; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm11 8961; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm11, %ymm1 8962; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 8963; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] 8964; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 8965; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm3 8966; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm1 8967; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm2 8968; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm22 8969; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm29, %zmm2 8970; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8971; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 8972; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 8973; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] 8974; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1 8975; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8976; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 8977; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] 8978; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 8979; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 8980; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} 8981; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm10 8982; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 8983; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm25 8984; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm1 8985; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 8986; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm25, %zmm25 8987; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm7 8988; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm0 8989; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm13 8990; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm13 8991; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 8992; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm14 8993; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm15, %xmm0 8994; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm14, %xmm1 8995; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 8996; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] 8997; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm4 8998; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8999; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9000; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 9001; AVX512-FCP-NEXT: vpermd %ymm0, %ymm30, %ymm5 9002; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm6 9003; 
AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3 9004; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm9 9005; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm8 9006; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] 9007; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] 9008; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm0 9009; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9010; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm9 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9011; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 9012; AVX512-FCP-NEXT: vpbroadcastq %xmm8, %ymm8 9013; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 9014; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm4 {%k2} 9015; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm5 9016; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm8 9017; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,1,3,3,4,5,6,7] 9018; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm17, %zmm4 9019; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero 9020; AVX512-FCP-NEXT: vpermt2d %ymm9, %ymm26, %ymm5 9021; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm16 9022; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9023; AVX512-FCP-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload 9024; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 9025; AVX512-FCP-NEXT: vpermd %ymm4, %ymm30, %ymm4 9026; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9027; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm9 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9028; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] 9029; AVX512-FCP-NEXT: vpbroadcastq %xmm5, %ymm5 9030; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 9031; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k2} 9032; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1 9033; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 9034; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm0, %ymm26 9035; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7] 9036; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm17, %zmm0 9037; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm9 9038; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] 9039; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm4 9040; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7] 9041; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm5 9042; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm17 9043; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm0 9044; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm2 9045; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7] 9046; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm31, %zmm4 9047; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] 9048; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] 9049; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,1,1,1,10,10,10,11] 9050; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 9051; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = 
[1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14] 9052; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 64-byte Folded Reload 9053; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} 9054; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 9055; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm3 9056; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [0,9,2,3,8,5,6,11] 9057; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm1 9058; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm27, %ymm1 9059; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 9060; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm30 = [8,9,20,11,12,21,14,15] 9061; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm30, %zmm2 9062; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 9063; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] 9064; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm3 9065; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm31 9066; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,0,10,10,0] 9067; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm3 9068; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9069; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload 9070; AVX512-FCP-NEXT: # ymm7 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] 9071; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9072; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 9073; AVX512-FCP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 9074; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 9075; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm7 # 64-byte Folded Reload 9076; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} 9077; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9078; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 9079; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm29 9080; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm27, %ymm29 9081; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 9082; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 9083; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm7 9084; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9085; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9086; AVX512-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] 9087; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm10 9088; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm6 9089; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] 9090; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 9091; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload 9092; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm0 {%k1} 9093; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm6 9094; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm10 9095; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 9096; AVX512-FCP-NEXT: vpermt2d %ymm10, %ymm27, %ymm19 9097; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload 9098; AVX512-FCP-NEXT: # ymm10 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 9099; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm30, 
%zmm0 9100; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm6 9101; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm10 9102; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11] 9103; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] 9104; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 9105; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm10 # 64-byte Folded Reload 9106; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm10 {%k1} 9107; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 9108; AVX512-FCP-NEXT: vpermi2d %ymm8, %ymm10, %ymm27 9109; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 9110; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 9111; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm30, %zmm10 9112; AVX512-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload 9113; AVX512-FCP-NEXT: # ymm8 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 9114; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 9115; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm14 9116; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm11 9117; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm28, %zmm11 9118; AVX512-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload 9119; AVX512-FCP-NEXT: # ymm8 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 9120; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 9121; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm12 9122; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm28, %zmm12 9123; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm6 9124; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm8 9125; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 9126; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm28, %zmm6 9127; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm27, %zmm8 9128; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 9129; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (zmm9 & (zmm6 ^ zmm8)) 9130; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 9131; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 256(%rax) 9132; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 9133; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm12 = zmm12 ^ (zmm9 & (zmm12 ^ zmm0)) 9134; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) 9135; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm29, %zmm0 9136; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (zmm9 & (zmm11 ^ zmm0)) 9137; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax) 9138; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm0 9139; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm0)) 9140; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) 9141; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 9142; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm0 & (zmm4 ^ zmm17)) 9143; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 9144; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm16)) 9145; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax) 9146; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 9147; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm25)) 9148; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) 9149; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 9150; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload 9151; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ mem)) 9152; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 320(%rax) 9153; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 9154; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload 9155; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem)) 9156; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 384(%rax) 9157; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 9158; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload 9159; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem)) 9160; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 576(%rax) 9161; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9162; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 9163; AVX512-FCP-NEXT: # zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ mem)) 9164; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 512(%rax) 9165; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9166; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 9167; AVX512-FCP-NEXT: # zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ mem)) 9168; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 704(%rax) 9169; AVX512-FCP-NEXT: addq $1240, %rsp # imm = 0x4D8 9170; AVX512-FCP-NEXT: vzeroupper 9171; AVX512-FCP-NEXT: retq 9172; 9173; AVX512DQ-LABEL: store_i16_stride6_vf64: 9174; AVX512DQ: # %bb.0: 9175; AVX512DQ-NEXT: subq $584, %rsp # imm = 0x248 9176; AVX512DQ-NEXT: vmovdqa 96(%rcx), %xmm0 9177; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 9178; AVX512DQ-NEXT: vmovdqa 96(%rdx), %xmm2 9179; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9180; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 9181; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9182; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9183; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 9184; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] 9185; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 9186; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm4 9187; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9188; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] 9189; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] 9190; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5 9191; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9192; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] 9193; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] 9194; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 9195; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 
9196; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 9197; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1] 9198; AVX512DQ-NEXT: movw $9362, %ax # imm = 0x2492 9199; AVX512DQ-NEXT: kmovw %eax, %k1 9200; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} 9201; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 9202; AVX512DQ-NEXT: vmovdqa 96(%r8), %xmm4 9203; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9204; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] 9205; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 9206; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] 9207; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 9208; AVX512DQ-NEXT: vpbroadcastq %xmm3, %ymm3 9209; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 9210; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 9211; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9212; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm5 9213; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm2 9214; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9215; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm6 9216; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm3 9217; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9218; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 9219; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9220; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9221; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 9222; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 9223; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm4 9224; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9225; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm7 9226; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] 9227; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] 9228; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm8 9229; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1] 9230; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] 9231; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 9232; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 9233; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm29 9234; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm30 9235; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 9236; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1] 9237; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} 9238; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 9239; AVX512DQ-NEXT: vmovdqa 64(%r8), %xmm7 9240; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] 9241; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 9242; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] 9243; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero 9244; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm31 9245; AVX512DQ-NEXT: vpbroadcastq %xmm3, %ymm3 9246; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 9247; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm7 9248; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 9249; 
AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9250; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9251; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9252; AVX512DQ-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9253; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9254; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 9255; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] 9256; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 9257; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1] 9258; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] 9259; AVX512DQ-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9260; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,1,2,1] 9261; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] 9262; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 9263; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] 9264; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] 9265; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1] 9266; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} 9267; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm4 9268; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9269; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7] 9270; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 9271; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 9272; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] 9273; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 9274; AVX512DQ-NEXT: vpbroadcastq %xmm3, %ymm3 9275; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] 9276; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 9277; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9278; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 9279; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4 9280; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9281; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9282; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 9283; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 9284; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm27 9285; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm28 9286; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 9287; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3 9288; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,1,2,1] 9289; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] 9290; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 9291; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1] 9292; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] 9293; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 9294; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 9295; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm20 9296; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm21 9297; 
AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 9298; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1] 9299; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} 9300; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3 9301; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] 9302; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 9303; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 9304; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] 9305; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 9306; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm18 9307; AVX512DQ-NEXT: vpbroadcastq %xmm2, %ymm2 9308; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] 9309; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 9310; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9311; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm2 9312; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7] 9313; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 9314; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm3 9315; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] 9316; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 9317; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 9318; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 9319; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] 9320; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm22 9321; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm26 9322; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] 9323; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm12 9324; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm2 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9325; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm9 9326; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm3 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9327; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 9328; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 9329; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15] 9330; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] 9331; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 9332; AVX512DQ-NEXT: movw $18724, %ax # imm = 0x4924 9333; AVX512DQ-NEXT: kmovw %eax, %k1 9334; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 9335; AVX512DQ-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} 9336; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm6 9337; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 9338; AVX512DQ-NEXT: vpshufb %ymm13, %ymm6, %ymm1 9339; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 9340; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 9341; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 9342; AVX512DQ-NEXT: 
vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 9343; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 9344; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] 9345; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 9346; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9347; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm2 9348; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7] 9349; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 9350; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm3 9351; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] 9352; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 9353; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 9354; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 9355; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] 9356; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm16 9357; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm17 9358; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] 9359; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm7 9360; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm2 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9361; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm5 9362; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9363; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 9364; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 9365; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15] 9366; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] 9367; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 9368; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 9369; AVX512DQ-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} 9370; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm8 9371; AVX512DQ-NEXT: vpshufb %ymm13, %ymm8, %ymm1 9372; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 9373; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 9374; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 9375; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 9376; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 9377; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] 9378; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 9379; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9380; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm2 9381; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9382; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7] 9383; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 9384; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 9385; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9386; AVX512DQ-NEXT: 
vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] 9387; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 9388; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 9389; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 9390; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] 9391; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] 9392; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm11 9393; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm2 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9394; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm10 9395; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9396; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 9397; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 9398; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] 9399; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] 9400; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] 9401; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 9402; AVX512DQ-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} 9403; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm3 9404; AVX512DQ-NEXT: vpshufb %ymm13, %ymm3, %ymm1 9405; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] 9406; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 9407; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 9408; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 9409; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm19 9410; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 9411; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] 9412; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 9413; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9414; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 9415; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,1,2,3,6,5,6,7] 9416; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 9417; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 9418; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] 9419; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] 9420; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 9421; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,1,2,3] 9422; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] 9423; AVX512DQ-NEXT: vpermq {{.*#+}} ymm23 = ymm0[3,3,3,3] 9424; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2 9425; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = 
ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9426; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm0 9427; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm15 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9428; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11] 9429; AVX512DQ-NEXT: vpermq {{.*#+}} ymm25 = ymm1[2,2,2,2] 9430; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 9431; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] 9432; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] 9433; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm24, %zmm1 9434; AVX512DQ-NEXT: vinserti32x8 $1, %ymm15, %zmm25, %zmm1 {%k1} 9435; AVX512DQ-NEXT: vmovdqa (%r8), %ymm15 9436; AVX512DQ-NEXT: vpshufb %ymm13, %ymm15, %ymm14 9437; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] 9438; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm13 9439; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7] 9440; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 9441; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] 9442; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm1[1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7] 9443; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm23 9444; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm1 9445; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm13 9446; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] 9447; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 9448; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] 9449; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm24, %zmm0 9450; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm1 9451; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm2 9452; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 9453; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] 9454; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] 9455; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 9456; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} 9457; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 9458; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 9459; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 9460; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 9461; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 9462; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm2 9463; AVX512DQ-NEXT: vpshufb %xmm4, %xmm2, %xmm2 9464; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 9465; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] 9466; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 9467; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9468; AVX512DQ-NEXT: vpunpckhwd 
(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 9469; AVX512DQ-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 9470; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11] 9471; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 9472; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9473; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 9474; AVX512DQ-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 9475; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] 9476; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm2 9477; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm3 9478; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] 9479; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] 9480; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} 9481; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 9482; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 9483; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 9484; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] 9485; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9486; AVX512DQ-NEXT: vpshufb %xmm4, %xmm2, %xmm2 9487; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm9 9488; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 9489; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 9490; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm26 9491; AVX512DQ-NEXT: vmovdqa 96(%r9), %xmm6 9492; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] 9493; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm0 9494; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] 9495; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] 9496; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 9497; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9498; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 9499; AVX512DQ-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] 9500; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11] 9501; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm24, %zmm3 9502; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm2 9503; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm4 9504; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 9505; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] 9506; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm4 9507; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm5 9508; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] 9509; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 9510; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 9511; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm16 # 64-byte Folded Reload 9512; AVX512DQ-NEXT: # zmm16 = 
zmm16 ^ (zmm28 & (zmm16 ^ mem)) 9513; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,2,2,3] 9514; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm2, %zmm3 {%k1} 9515; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm1 9516; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 9517; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 9518; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 9519; AVX512DQ-NEXT: vmovdqa (%r9), %xmm14 9520; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm0 9521; AVX512DQ-NEXT: vpshufb %xmm9, %xmm0, %xmm2 9522; AVX512DQ-NEXT: vmovdqa %xmm9, %xmm5 9523; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 9524; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 9525; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm8 9526; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm27 9527; AVX512DQ-NEXT: vmovdqa 64(%r9), %xmm9 9528; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,0,2,1,4,5,6,7] 9529; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1 9530; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7] 9531; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] 9532; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] 9533; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 9534; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7] 9535; AVX512DQ-NEXT: vpbroadcastq %xmm2, %ymm0 9536; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] 9537; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] 9538; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm2[0,0,2,1] 9539; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[0,0,2,1,4,5,6,7] 9540; AVX512DQ-NEXT: vpbroadcastq %xmm2, %ymm29 9541; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9542; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload 9543; AVX512DQ-NEXT: # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] 9544; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7] 9545; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] 9546; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm2[0,0,2,1] 9547; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] 9548; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm11 9549; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm24, %zmm2 9550; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 9551; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 9552; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm3[2,2,2,3] 9553; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7] 9554; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 9555; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm4[2,1,2,3] 9556; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 9557; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm10 # 16-byte Folded Reload 9558; AVX512DQ-NEXT: # xmm10 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] 9559; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm4 9560; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 9561; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload 9562; AVX512DQ-NEXT: # ymm7 = 
ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] 9563; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 9564; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 9565; AVX512DQ-NEXT: vpermq {{.*#+}} ymm18 = ymm13[2,2,2,3] 9566; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,1,1,1] 9567; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] 9568; AVX512DQ-NEXT: vinserti32x8 $1, %ymm7, %zmm10, %zmm2 {%k1} 9569; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[2,3,2,3,6,7,6,7] 9570; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 9571; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] 9572; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm10 9573; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm3 9574; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 9575; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] 9576; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] 9577; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm10 9578; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 9579; AVX512DQ-NEXT: vpshufb %xmm5, %xmm3, %xmm3 9580; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 9581; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 9582; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] 9583; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] 9584; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] 9585; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,3,2,3,6,7,6,7] 9586; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 9587; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] 9588; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm1 # 64-byte Folded Reload 9589; AVX512DQ-NEXT: # zmm1 = zmm1 ^ (zmm28 & (zmm1 ^ mem)) 9590; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 9591; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm2, %zmm2 9592; AVX512DQ-NEXT: vmovdqa (%r9), %ymm15 9593; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] 9594; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 9595; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] 9596; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm15[2,3,2,3,6,7,6,7] 9597; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] 9598; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] 9599; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] 9600; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7] 9601; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] 9602; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 9603; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] 9604; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] 9605; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] 9606; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 9607; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 9608; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] 9609; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] 9610; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7] 9611; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 9612; 
AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 9613; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] 9614; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] 9615; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7] 9616; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] 9617; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 9618; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] 9619; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm12 # 64-byte Folded Reload 9620; AVX512DQ-NEXT: # zmm12 = zmm12 ^ (zmm28 & (zmm12 ^ mem)) 9621; AVX512DQ-NEXT: vinserti64x4 $1, %ymm30, %zmm29, %zmm17 9622; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm17 # 64-byte Folded Reload 9623; AVX512DQ-NEXT: # zmm17 = zmm17 ^ (zmm28 & (zmm17 ^ mem)) 9624; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm24, %zmm22 9625; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 9626; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm22 # 64-byte Folded Reload 9627; AVX512DQ-NEXT: # zmm22 = zmm22 ^ (zmm24 & (zmm22 ^ mem)) 9628; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 9629; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm7 # 64-byte Folded Reload 9630; AVX512DQ-NEXT: # zmm7 = zmm7 ^ (zmm24 & (zmm7 ^ mem)) 9631; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 9632; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm3 # 64-byte Folded Reload 9633; AVX512DQ-NEXT: # zmm3 = zmm3 ^ (zmm24 & (zmm3 ^ mem)) 9634; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 9635; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (zmm24 & (zmm0 ^ zmm23)) 9636; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm5 9637; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 9638; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm25)) 9639; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 9640; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (zmm13 & (zmm6 ^ zmm26)) 9641; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 9642; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm13 & (zmm4 ^ zmm27)) 9643; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 9644; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm8 = zmm8 ^ (zmm13 & (zmm8 ^ zmm2)) 9645; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 9646; AVX512DQ-NEXT: vmovdqa64 %zmm8, 256(%rax) 9647; AVX512DQ-NEXT: vmovdqa64 %zmm4, 448(%rax) 9648; AVX512DQ-NEXT: vmovdqa64 %zmm6, 640(%rax) 9649; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) 9650; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rax) 9651; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rax) 9652; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rax) 9653; AVX512DQ-NEXT: vmovdqa64 %zmm16, 576(%rax) 9654; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) 9655; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rax) 9656; AVX512DQ-NEXT: vmovdqa64 %zmm7, 512(%rax) 9657; AVX512DQ-NEXT: vmovdqa64 %zmm22, 704(%rax) 9658; AVX512DQ-NEXT: addq $584, %rsp # imm = 0x248 9659; AVX512DQ-NEXT: vzeroupper 9660; AVX512DQ-NEXT: retq 9661; 9662; AVX512DQ-FCP-LABEL: store_i16_stride6_vf64: 9663; AVX512DQ-FCP: # %bb.0: 9664; AVX512DQ-FCP-NEXT: subq $1176, %rsp # imm = 0x498 9665; 
AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm1 9666; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9667; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm0 9668; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9669; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] 9670; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 9671; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9672; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm4 9673; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9674; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm6 9675; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9676; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 9677; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9678; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm5 9679; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9680; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 9681; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9682; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 9683; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 9684; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm2 9685; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm7 9686; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11] 9687; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm29 9688; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31 9689; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm2 9690; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9691; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm7 9692; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9693; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] 9694; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm2 9695; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm1 9696; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9697; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm8 9698; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] 9699; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm30 9700; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] 9701; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm20 9702; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm3 9703; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9704; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm1 9705; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9706; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] 9707; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 9708; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm16 9709; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [1,1,1,1,10,10,10,11] 9710; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3 9711; 
AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9712; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 9713; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9714; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] 9715; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm4 9716; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9717; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3 9718; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9719; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 9720; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm3 9721; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14] 9722; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm21 9723; AVX512DQ-FCP-NEXT: movw $18724, %ax # imm = 0x4924 9724; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 9725; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm21 {%k1} 9726; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,9,2,3,8,5,6,11] 9727; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm1 9728; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm0 9729; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9730; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] 9731; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 9732; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm24, %ymm1 9733; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9734; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [8,9,20,11,12,21,14,15] 9735; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 9736; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9737; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 9738; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm21 9739; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,1,0,10,10,0] 9740; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm0 9741; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9742; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 9743; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm1 9744; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9745; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] 9746; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 9747; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 9748; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9749; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm9 9750; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 9751; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11] 9752; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm0 9753; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm15 9754; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] 9755; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm23 9756; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm4 9757; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm19, %zmm25 9758; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm4, %zmm25 {%k1} 9759; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 9760; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm1 9761; AVX512DQ-FCP-NEXT: vmovdqa 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9762; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm2 9763; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm24, %ymm0 9764; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9765; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm10 9766; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 9767; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm18, %zmm25 9768; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm11 9769; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 9770; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm0 9771; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9772; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 9773; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm0 9774; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9775; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm0 9776; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9777; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 9778; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9779; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm13 9780; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm12 9781; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] 9782; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 9783; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm17, %zmm6 9784; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm19, %zmm28 9785; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm28 {%k1} 9786; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm20 9787; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm0 9788; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9789; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm6 9790; AVX512DQ-FCP-NEXT: vpermt2d %ymm6, %ymm24, %ymm20 9791; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm6 9792; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 9793; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm18, %zmm28 9794; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm1 9795; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9796; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm7 9797; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 9798; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 9799; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 9800; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9801; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm3 9802; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9803; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 9804; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill 9805; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 9806; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9807; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 9808; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9809; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 9810; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 9811; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 9812; 
AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm19, %zmm19 9813; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} 9814; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm0 9815; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9816; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 9817; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm19, %ymm24 9818; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm0 9819; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9820; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 9821; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 9822; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm0 9823; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9824; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm1 9825; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm0 9826; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9827; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] 9828; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 9829; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9830; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] 9831; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm0 9832; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm1 9833; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] 9834; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[12],ymm9[12],ymm5[13],ymm9[13],ymm5[14],ymm9[14],ymm5[15],ymm9[15] 9835; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [2,1,2,3,11,11,11,11] 9836; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm8 9837; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [5,6,5,6,5,6,7,7] 9838; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm2 9839; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 9840; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] 9841; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm26, %ymm0 9842; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9843; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm5 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9844; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] 9845; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 9846; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm8 {%k1} 9847; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [12,1,2,13,4,5,14,7] 9848; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm0 9849; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 9850; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm22, %ymm0 9851; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] 9852; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm1 9853; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm10 9854; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm29 = 
[8,21,10,11,20,13,14,23] 9855; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm29, %zmm8 9856; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 9857; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9858; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] 9859; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm0 9860; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] 9861; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] 9862; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm1 9863; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,0,3,10,0,10,11] 9864; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 9865; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm0 9866; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm12, %ymm2 9867; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] 9868; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] 9869; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 9870; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm4 9871; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 9872; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] 9873; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm26, %ymm2 9874; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9875; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9876; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] 9877; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] 9878; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0 {%k1} 9879; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 9880; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 9881; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm22, %ymm2 9882; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm3 9883; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm29, %zmm0 9884; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 9885; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9886; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm0 9887; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm3 9888; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm12 9889; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 9890; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] 9891; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm4 9892; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm0 9893; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm15, %xmm2 9894; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 9895; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] 9896; AVX512DQ-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm18 = [0,0,2,1,8,9,8,9] 9897; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 9898; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [1,0,2,2,1,0,2,2] 9899; AVX512DQ-FCP-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] 9900; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9901; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 9902; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 9903; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm30, %ymm0 9904; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9905; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9906; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 9907; AVX512DQ-FCP-NEXT: vpbroadcastq %xmm4, %ymm4 9908; AVX512DQ-FCP-NEXT: movw $9362, %ax # imm = 0x2492 9909; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 9910; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm2 {%k2} 9911; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9912; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 9913; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] 9914; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm5 9915; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [16,9,10,17,12,13,18,15] 9916; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm2 9917; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,1,8,3,4,9,6,7] 9918; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm0, %ymm5 9919; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 9920; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9921; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] 9922; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9923; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm2 9924; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] 9925; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,8,8,0,9] 9926; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm4 9927; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 9928; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm2 9929; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9930; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm5 9931; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] 9932; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 9933; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm5 9934; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 9935; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9936; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 9937; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9938; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 9939; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 9940; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm30, %ymm2 9941; AVX512DQ-FCP-NEXT: vpbroadcastq %xmm6, %ymm6 9942; AVX512DQ-FCP-NEXT: vinserti32x8 
$1, %ymm6, %zmm2, %zmm5 {%k2} 9943; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 9944; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9945; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7] 9946; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm17, %zmm5 9947; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero 9948; AVX512DQ-FCP-NEXT: vpermt2d %ymm6, %ymm0, %ymm2 9949; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm23 9950; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9951; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm2 9952; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,1,4,5,6,7] 9953; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 9954; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9955; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm2 9956; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9957; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm6 9958; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] 9959; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[12],ymm9[12],ymm7[13],ymm9[13],ymm7[14],ymm9[14],ymm7[15],ymm9[15] 9960; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm6 9961; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9962; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9963; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm7[4],ymm9[5],ymm7[5],ymm9[6],ymm7[6],ymm9[7],ymm7[7],ymm9[12],ymm7[12],ymm9[13],ymm7[13],ymm9[14],ymm7[14],ymm9[15],ymm7[15] 9964; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm7 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9965; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9966; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11] 9967; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm26, %ymm2 9968; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] 9969; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm2, %zmm7, %zmm6 {%k1} 9970; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7 9971; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9972; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 9973; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm22, %ymm7 9974; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 9975; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm2 9976; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm29, %zmm6 9977; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 9978; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm2 9979; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 9980; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm9 9981; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] 9982; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm9 = 
ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] 9983; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 9984; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm27 9985; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 9986; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] 9987; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm7 9988; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm15 9989; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6 9990; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm31, %zmm6 9991; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9992; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 9993; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] 9994; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm26, %ymm7 9995; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm11 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9996; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm12 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 9997; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11] 9998; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] 9999; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm7, %zmm11, %zmm2 {%k1} 10000; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 10001; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] 10002; AVX512DQ-FCP-NEXT: vpermi2d %ymm7, %ymm2, %ymm22 10003; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm9, %ymm7 10004; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm29, %zmm2 10005; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10006; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm11 10007; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm7 10008; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm31, %zmm7 10009; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10010; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm11 10011; AVX512DQ-FCP-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload 10012; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm12 10013; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 10014; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] 10015; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm11 10016; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10017; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 10018; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] 10019; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm30, %ymm12 10020; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 10021; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm14 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 10022; 
AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] 10023; AVX512DQ-FCP-NEXT: vpbroadcastq %xmm13, %ymm13 10024; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 10025; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm14 10026; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10027; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm8 10028; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] 10029; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] 10030; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm8 10031; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm13, %zmm12, %zmm11 {%k2} 10032; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10033; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero 10034; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[2,1,3,3,4,5,6,7] 10035; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm14 10036; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm11 10037; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 10038; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10039; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] 10040; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm30, %ymm13 10041; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm15 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 10042; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm10 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 10043; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] 10044; AVX512DQ-FCP-NEXT: vpbroadcastq %xmm10, %ymm10 10045; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm10, %zmm13, %zmm8 {%k2} 10046; AVX512DQ-FCP-NEXT: vpermt2d %ymm12, %ymm0, %ymm14 10047; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10048; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero 10049; AVX512DQ-FCP-NEXT: vpermi2d %ymm10, %ymm8, %ymm0 10050; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,1,3,3,4,5,6,7] 10051; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm8 10052; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm22, %zmm2 10053; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm10 10054; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10055; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] 10056; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm11 10057; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,0,2,1,4,5,6,7] 10058; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm12 10059; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 10060; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10061; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm8 10062; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,0,2,1,4,5,6,7] 10063; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm11 10064; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 10065; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ 
(zmm8 & (zmm11 ^ zmm0)) 10066; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 10067; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) 10068; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm12 = zmm12 ^ (zmm8 & (zmm12 ^ zmm10)) 10069; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) 10070; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 10071; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm2)) 10072; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 128(%rax) 10073; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm27)) 10074; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax) 10075; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm2 10076; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] 10077; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 10078; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm6 & (zmm7 ^ zmm2)) 10079; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 256(%rax) 10080; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm2 10081; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 10082; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm6 & (zmm7 ^ zmm2)) 10083; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 448(%rax) 10084; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (zmm8 & (zmm5 ^ zmm23)) 10085; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) 10086; AVX512DQ-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload 10087; AVX512DQ-FCP-NEXT: # zmm4 = zmm4 ^ (zmm8 & (zmm4 ^ mem)) 10088; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 576(%rax) 10089; AVX512DQ-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload 10090; AVX512DQ-FCP-NEXT: # zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ mem)) 10091; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax) 10092; AVX512DQ-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload 10093; AVX512DQ-FCP-NEXT: # zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ mem)) 10094; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 704(%rax) 10095; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10096; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm0 10097; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 10098; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm6 & (zmm1 ^ zmm0)) 10099; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 640(%rax) 10100; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 10101; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 10102; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 10103; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm6 & (zmm1 ^ zmm0)) 10104; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) 10105; AVX512DQ-FCP-NEXT: addq $1176, %rsp # imm = 0x498 10106; AVX512DQ-FCP-NEXT: vzeroupper 10107; AVX512DQ-FCP-NEXT: retq 10108; 10109; AVX512BW-LABEL: store_i16_stride6_vf64: 10110; AVX512BW: # %bb.0: 10111; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 10112; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 10113; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm16 10114; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm24 10115; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 
10116; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 10117; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 10118; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm15 10119; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] 10120; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 10121; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 10122; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm13 10123; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] 10124; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 10125; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 10126; AVX512BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm2 10127; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] 10128; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 10129; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 10130; AVX512BW-NEXT: vpermt2w %zmm15, %zmm11, %zmm20 10131; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] 10132; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 10133; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 10134; AVX512BW-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 10135; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] 10136; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] 10137; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 10138; AVX512BW-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 10139; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] 10140; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 10141; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 10142; AVX512BW-NEXT: vpermt2w %zmm15, %zmm8, %zmm6 10143; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] 10144; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 10145; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 10146; AVX512BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm23 10147; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] 10148; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 10149; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 10150; AVX512BW-NEXT: vpermt2w %zmm24, %zmm10, %zmm9 10151; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] 10152; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] 10153; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 10154; AVX512BW-NEXT: vpermt2w %zmm24, %zmm17, %zmm14 10155; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 10156; AVX512BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm0 10157; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] 10158; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 10159; AVX512BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm25 10160; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] 10161; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] 10162; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm5 10163; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm18 10164; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm10 10165; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm17 10166; AVX512BW-NEXT: vpermt2w %zmm16, 
%zmm26, %zmm3 10167; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 10168; AVX512BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm26 10169; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 10170; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm7 10171; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] 10172; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 10173; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm16 10174; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm11 10175; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm8 10176; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm21 10177; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm24 10178; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm1 10179; AVX512BW-NEXT: movw $9362, %ax # imm = 0x2492 10180; AVX512BW-NEXT: kmovd %eax, %k2 10181; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm2 {%k2} 10182; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm12 10183; AVX512BW-NEXT: movw $18724, %ax # imm = 0x4924 10184; AVX512BW-NEXT: kmovd %eax, %k1 10185; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} 10186; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 10187; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} 10188; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] 10189; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 10190; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} 10191; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] 10192; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 10193; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} 10194; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] 10195; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 10196; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} 10197; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] 10198; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 10199; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} 10200; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] 10201; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 10202; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 10203; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] 10204; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 10205; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} 10206; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 10207; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm8 {%k1} 10208; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm8 10209; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} 10210; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm11 10211; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 10212; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm13 10213; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} 10214; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] 10215; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 10216; AVX512BW-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 10217; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] 10218; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 10219; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} 10220; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = 
[0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] 10221; AVX512BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 10222; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 10223; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] 10224; AVX512BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 10225; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] 10226; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 10227; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 10228; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] 10229; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 10230; AVX512BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 10231; AVX512BW-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 10232; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 10233; AVX512BW-NEXT: vpermt2w %zmm11, %zmm15, %zmm17 10234; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm1 10235; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 10236; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) 10237; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) 10238; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) 10239; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rax) 10240; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax) 10241; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%rax) 10242; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rax) 10243; AVX512BW-NEXT: vmovdqa64 %zmm14, 512(%rax) 10244; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rax) 10245; AVX512BW-NEXT: vmovdqa64 %zmm6, 640(%rax) 10246; AVX512BW-NEXT: vmovdqa64 %zmm4, 704(%rax) 10247; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) 10248; AVX512BW-NEXT: vzeroupper 10249; AVX512BW-NEXT: retq 10250; 10251; AVX512BW-FCP-LABEL: store_i16_stride6_vf64: 10252; AVX512BW-FCP: # %bb.0: 10253; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 10254; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 10255; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm16 10256; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 10257; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm1 10258; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 10259; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm12 10260; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm15 10261; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] 10262; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 10263; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 10264; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm13 10265; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] 10266; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 10267; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 10268; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm26, %zmm2 10269; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] 10270; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 10271; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 10272; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm11, %zmm20 10273; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] 10274; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 10275; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 10276; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 10277; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] 10278; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] 10279; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 10280; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 10281; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] 10282; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 10283; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 10284; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm8, %zmm6 10285; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] 10286; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 10287; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 10288; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm21, %zmm23 10289; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] 10290; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 10291; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 10292; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm10, %zmm9 10293; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] 10294; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] 10295; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 10296; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm17, %zmm14 10297; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 10298; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm26, %zmm0 10299; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] 10300; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 10301; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm26, %zmm25 10302; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] 10303; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] 10304; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm5 10305; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm18 10306; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm10 10307; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm17 10308; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm26, %zmm3 10309; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 10310; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm24, %zmm26 10311; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 10312; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm19, %zmm7 10313; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] 10314; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 10315; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm19, %zmm16 10316; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm11 10317; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm8 10318; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm21 10319; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm24 10320; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm1 10321; AVX512BW-FCP-NEXT: movw $9362, %ax # imm = 0x2492 10322; AVX512BW-FCP-NEXT: kmovd %eax, %k2 10323; AVX512BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm2 {%k2} 10324; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm12 10325; AVX512BW-FCP-NEXT: movw $18724, %ax # imm = 0x4924 10326; AVX512BW-FCP-NEXT: kmovd %eax, %k1 10327; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} 10328; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 10329; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, 
%zmm6 {%k1} 10330; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] 10331; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 10332; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} 10333; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] 10334; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 10335; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} 10336; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] 10337; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 10338; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} 10339; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] 10340; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 10341; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} 10342; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] 10343; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 10344; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 10345; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] 10346; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 10347; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} 10348; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 10349; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm8 {%k1} 10350; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm8 10351; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} 10352; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm11 10353; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 10354; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 10355; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} 10356; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] 10357; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 10358; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 10359; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] 10360; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 10361; AVX512BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} 10362; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] 10363; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 10364; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 10365; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] 10366; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 10367; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] 10368; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 10369; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 10370; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] 10371; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 10372; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 10373; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 10374; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm12, 
%zmm10 10375; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm15, %zmm17 10376; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm18, %zmm1 10377; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 10378; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) 10379; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 128(%rax) 10380; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) 10381; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) 10382; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) 10383; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%rax) 10384; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) 10385; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 512(%rax) 10386; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) 10387; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 640(%rax) 10388; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 704(%rax) 10389; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) 10390; AVX512BW-FCP-NEXT: vzeroupper 10391; AVX512BW-FCP-NEXT: retq 10392; 10393; AVX512DQ-BW-LABEL: store_i16_stride6_vf64: 10394; AVX512DQ-BW: # %bb.0: 10395; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 10396; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0 10397; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm16 10398; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm24 10399; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm1 10400; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm7 10401; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm12 10402; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm15 10403; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] 10404; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 10405; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 10406; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm13 10407; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] 10408; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 10409; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 10410; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm2 10411; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] 10412; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 10413; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm20 10414; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm11, %zmm20 10415; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] 10416; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 10417; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 10418; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 10419; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] 10420; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] 10421; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 10422; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 10423; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] 10424; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 10425; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 10426; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm8, %zmm6 10427; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] 10428; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 10429; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm23 10430; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm23 10431; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] 10432; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 10433; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 10434; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm10, %zmm9 10435; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] 10436; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] 10437; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 10438; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm17, %zmm14 10439; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 10440; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm0 10441; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] 10442; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 10443; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm25 10444; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] 10445; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] 10446; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm5 10447; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm18 10448; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm10 10449; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm17 10450; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm3 10451; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm26 10452; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm26 10453; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm16 10454; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm7 10455; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] 10456; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 10457; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm16 10458; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm11 10459; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm8 10460; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm21 10461; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm24 10462; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm1 10463; AVX512DQ-BW-NEXT: movw $9362, %ax # imm = 0x2492 10464; AVX512DQ-BW-NEXT: kmovd %eax, %k2 10465; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm2 {%k2} 10466; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm12 10467; AVX512DQ-BW-NEXT: movw $18724, %ax # imm = 0x4924 10468; AVX512DQ-BW-NEXT: kmovd %eax, %k1 10469; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} 10470; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm13 10471; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} 10472; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] 10473; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 10474; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} 10475; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] 10476; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 10477; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} 10478; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] 10479; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 10480; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} 10481; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] 10482; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 10483; 
AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} 10484; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] 10485; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 10486; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 10487; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] 10488; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 10489; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} 10490; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 10491; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm8 {%k1} 10492; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm8 10493; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} 10494; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm11 10495; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 10496; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm13 10497; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} 10498; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] 10499; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 10500; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 10501; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] 10502; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 10503; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} 10504; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] 10505; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 10506; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 10507; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] 10508; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 10509; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] 10510; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 10511; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 10512; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] 10513; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 10514; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 10515; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 10516; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 10517; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm15, %zmm17 10518; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm1 10519; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 10520; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rax) 10521; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 128(%rax) 10522; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%rax) 10523; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 256(%rax) 10524; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 320(%rax) 10525; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 384(%rax) 10526; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 448(%rax) 10527; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 512(%rax) 10528; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 576(%rax) 10529; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 640(%rax) 10530; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 704(%rax) 10531; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) 10532; AVX512DQ-BW-NEXT: vzeroupper 10533; AVX512DQ-BW-NEXT: retq 10534; 10535; AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf64: 10536; AVX512DQ-BW-FCP: # %bb.0: 10537; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), 
%zmm3 10538; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 10539; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm16 10540; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 10541; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm1 10542; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 10543; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm12 10544; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm15 10545; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] 10546; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 10547; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 10548; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm13 10549; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] 10550; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] 10551; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 10552; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm26, %zmm2 10553; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] 10554; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 10555; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 10556; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm11, %zmm20 10557; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] 10558; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 10559; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 10560; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 10561; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] 10562; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] 10563; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 10564; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 10565; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] 10566; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 10567; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 10568; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm8, %zmm6 10569; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] 10570; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 10571; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 10572; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm21, %zmm23 10573; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] 10574; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 10575; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 10576; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm10, %zmm9 10577; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] 10578; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] 10579; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 10580; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm17, %zmm14 10581; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 10582; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm26, %zmm0 10583; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] 10584; AVX512DQ-BW-FCP-NEXT: # zmm26 = 
mem[0,1,2,3,0,1,2,3] 10585; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm26, %zmm25 10586; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] 10587; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] 10588; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm5 10589; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm18 10590; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm10 10591; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm17 10592; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm26, %zmm3 10593; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 10594; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm24, %zmm26 10595; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 10596; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm19, %zmm7 10597; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] 10598; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] 10599; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm19, %zmm16 10600; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm11 10601; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm8 10602; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm21 10603; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm24 10604; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm1 10605; AVX512DQ-BW-FCP-NEXT: movw $9362, %ax # imm = 0x2492 10606; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 10607; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm2 {%k2} 10608; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm12 10609; AVX512DQ-BW-FCP-NEXT: movw $18724, %ax # imm = 0x4924 10610; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 10611; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} 10612; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 10613; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} 10614; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] 10615; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 10616; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} 10617; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] 10618; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 10619; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} 10620; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] 10621; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 10622; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} 10623; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] 10624; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 10625; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} 10626; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] 10627; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 10628; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 10629; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] 10630; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 10631; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} 10632; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 10633; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm8 
{%k1} 10634; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm8 10635; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} 10636; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm11 10637; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 10638; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 10639; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} 10640; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] 10641; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 10642; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 10643; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] 10644; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 10645; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} 10646; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] 10647; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 10648; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 10649; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] 10650; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 10651; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] 10652; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 10653; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 10654; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] 10655; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 10656; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 10657; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 10658; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 10659; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm15, %zmm17 10660; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm18, %zmm1 10661; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 10662; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) 10663; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 128(%rax) 10664; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) 10665; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) 10666; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) 10667; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%rax) 10668; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) 10669; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 512(%rax) 10670; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) 10671; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 640(%rax) 10672; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 704(%rax) 10673; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) 10674; AVX512DQ-BW-FCP-NEXT: vzeroupper 10675; AVX512DQ-BW-FCP-NEXT: retq 10676 %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 10677 %in.vec1 = load <64 x i16>, ptr %in.vecptr1, align 64 10678 %in.vec2 = load <64 x i16>, ptr %in.vecptr2, align 64 10679 %in.vec3 = load <64 x i16>, ptr %in.vecptr3, align 64 10680 %in.vec4 = load <64 x i16>, ptr %in.vecptr4, align 64 10681 %in.vec5 = load <64 x i16>, ptr %in.vecptr5, align 64 10682 %1 = shufflevector <64 x i16> %in.vec0, <64 x i16> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 
24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 10683 %2 = shufflevector <64 x i16> %in.vec2, <64 x i16> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 10684 %3 = shufflevector <64 x i16> %in.vec4, <64 x i16> %in.vec5, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 10685 %4 = shufflevector <128 x i16> %1, <128 x i16> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, 
i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> 10686 %5 = shufflevector <128 x i16> %3, <128 x i16> poison, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 10687 %6 = shufflevector <256 x i16> %4, <256 x i16> %5, <384 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 
312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319, i32 320, i32 321, i32 322, i32 323, i32 324, i32 325, i32 326, i32 327, i32 328, i32 329, i32 330, i32 331, i32 332, i32 333, i32 334, i32 335, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 360, i32 361, i32 362, i32 363, i32 364, i32 365, i32 366, i32 367, i32 368, i32 369, i32 370, i32 371, i32 372, i32 373, i32 374, i32 375, i32 376, i32 377, i32 378, i32 379, i32 380, i32 381, i32 382, i32 383> 10688 %interleaved.vec = shufflevector <384 x i16> %6, <384 x i16> poison, <384 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 320, i32 1, i32 65, i32 129, i32 193, i32 257, i32 321, i32 2, i32 66, i32 130, i32 194, i32 258, i32 322, i32 3, i32 67, i32 131, i32 195, i32 259, i32 323, i32 4, i32 68, i32 132, i32 196, i32 260, i32 324, i32 5, i32 69, i32 133, i32 197, i32 261, i32 325, i32 6, i32 70, i32 134, i32 198, i32 262, i32 326, i32 7, i32 71, i32 135, i32 199, i32 263, i32 327, i32 8, i32 72, i32 136, i32 200, i32 264, i32 328, i32 9, i32 73, i32 137, i32 201, i32 265, i32 329, i32 10, i32 74, i32 138, i32 202, i32 266, i32 330, i32 11, i32 75, i32 139, i32 203, i32 267, i32 331, i32 12, i32 76, i32 140, i32 204, i32 268, i32 332, i32 13, i32 77, i32 141, i32 205, i32 269, i32 333, i32 14, i32 78, i32 142, i32 206, i32 270, i32 334, i32 15, i32 79, i32 143, i32 207, i32 271, i32 335, i32 16, i32 80, i32 144, i32 208, i32 272, i32 336, i32 17, i32 81, i32 145, i32 209, i32 273, i32 337, i32 18, i32 82, i32 146, i32 210, i32 274, i32 338, i32 19, i32 83, i32 147, i32 211, i32 275, i32 339, i32 20, i32 84, i32 148, i32 212, i32 276, i32 340, i32 21, i32 85, i32 149, i32 213, i32 277, i32 341, i32 22, i32 86, i32 150, i32 214, i32 278, i32 342, i32 23, i32 87, i32 151, i32 215, i32 279, i32 343, i32 24, i32 88, i32 152, i32 216, i32 280, i32 344, i32 25, i32 89, i32 153, i32 217, i32 281, i32 345, i32 26, i32 90, i32 154, i32 218, i32 282, i32 346, i32 27, i32 91, i32 155, i32 219, i32 283, i32 347, i32 28, i32 92, i32 156, i32 220, i32 284, i32 348, i32 29, i32 93, i32 157, i32 221, i32 285, i32 349, i32 30, i32 94, i32 158, i32 222, i32 286, i32 350, i32 31, i32 95, i32 159, i32 223, i32 287, i32 351, i32 32, i32 96, i32 160, i32 224, i32 288, i32 352, i32 33, i32 97, i32 161, i32 225, i32 289, i32 353, i32 34, i32 98, i32 162, i32 226, i32 290, i32 354, i32 35, i32 99, i32 163, i32 227, i32 291, i32 355, i32 36, i32 100, i32 164, i32 228, i32 292, i32 356, i32 37, i32 101, i32 165, i32 229, i32 293, i32 357, i32 38, i32 102, i32 166, i32 230, i32 294, i32 358, i32 39, i32 103, i32 167, i32 231, i32 295, i32 359, i32 40, i32 104, i32 168, i32 232, i32 296, i32 360, i32 41, i32 105, i32 169, i32 233, i32 297, i32 361, i32 42, i32 106, i32 170, i32 234, i32 298, i32 362, i32 43, i32 107, i32 171, i32 235, i32 299, i32 363, i32 44, i32 108, i32 172, i32 236, i32 300, i32 364, i32 45, i32 109, i32 173, i32 237, i32 301, i32 365, i32 46, i32 110, i32 174, i32 238, i32 302, i32 366, i32 47, i32 111, i32 175, i32 239, i32 303, i32 367, i32 48, i32 112, i32 176, i32 240, i32 304, i32 368, i32 49, i32 113, i32 177, i32 241, i32 305, i32 369, i32 50, i32 114, i32 178, i32 242, i32 306, i32 370, i32 51, i32 115, i32 179, i32 243, i32 307, i32 371, i32 52, i32 116, i32 180, i32 244, i32 308, i32 372, i32 53, i32 117, i32 181, i32 245, i32 309, i32 373, i32 
54, i32 118, i32 182, i32 246, i32 310, i32 374, i32 55, i32 119, i32 183, i32 247, i32 311, i32 375, i32 56, i32 120, i32 184, i32 248, i32 312, i32 376, i32 57, i32 121, i32 185, i32 249, i32 313, i32 377, i32 58, i32 122, i32 186, i32 250, i32 314, i32 378, i32 59, i32 123, i32 187, i32 251, i32 315, i32 379, i32 60, i32 124, i32 188, i32 252, i32 316, i32 380, i32 61, i32 125, i32 189, i32 253, i32 317, i32 381, i32 62, i32 126, i32 190, i32 254, i32 318, i32 382, i32 63, i32 127, i32 191, i32 255, i32 319, i32 383> 10689 store <384 x i16> %interleaved.vec, ptr %out.vec, align 64 10690 ret void 10691} 10692