; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved stores.
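; A rough illustration of the kind of source loop that gives rise to these
; patterns (a hedged sketch only; the function and variable names below are
; hypothetical and do not come from this test). Each of the eight input
; pointers in the test functions below models one interleaved field of the
; stride-8 output:
;
;   void store_stride8(const unsigned char *a, const unsigned char *b,
;                      const unsigned char *c, const unsigned char *d,
;                      const unsigned char *e, const unsigned char *f,
;                      const unsigned char *g, const unsigned char *h,
;                      unsigned char *out, int n) {
;     for (int i = 0; i < n; ++i) {
;       out[8 * i + 0] = a[i];  // field 0 of the interleaved group
;       out[8 * i + 1] = b[i];
;       out[8 * i + 2] = c[i];
;       out[8 * i + 3] = d[i];
;       out[8 * i + 4] = e[i];
;       out[8 * i + 5] = f[i];
;       out[8 * i + 6] = g[i];
;       out[8 * i + 7] = h[i];  // field 7; the vectorizer emits one wide,
;     }                         // shuffled store per vectorized iteration
;   }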
17 18define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { 19; SSE-LABEL: store_i8_stride8_vf2: 20; SSE: # %bb.0: 21; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 22; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 23; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 24; SSE-NEXT: movdqa (%rdi), %xmm0 25; SSE-NEXT: movdqa (%rdx), %xmm1 26; SSE-NEXT: movdqa (%r8), %xmm2 27; SSE-NEXT: movdqa (%r11), %xmm3 28; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 29; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 30; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 31; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 32; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 33; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 34; SSE-NEXT: pxor %xmm1, %xmm1 35; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 36; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 37; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 38; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] 39; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] 40; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] 41; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] 42; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 43; SSE-NEXT: packuswb %xmm2, %xmm3 44; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] 45; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 46; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] 47; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 48; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 49; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 50; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] 51; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 52; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 53; SSE-NEXT: packuswb %xmm0, %xmm1 54; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 55; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 56; SSE-NEXT: movdqa %xmm0, (%rax) 57; SSE-NEXT: retq 58; 59; AVX-LABEL: store_i8_stride8_vf2: 60; AVX: # %bb.0: 61; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 62; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 63; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11 64; AVX-NEXT: vmovdqa (%rdi), %xmm0 65; AVX-NEXT: vmovdqa (%rdx), %xmm1 66; AVX-NEXT: vmovdqa (%r8), %xmm2 67; AVX-NEXT: vmovdqa (%r11), %xmm3 68; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 69; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 70; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 71; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 72; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 73; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 74; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 75; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] 76; AVX-NEXT: vmovdqa %xmm0, (%rax) 77; AVX-NEXT: retq 78; 79; AVX2-LABEL: store_i8_stride8_vf2: 80; AVX2: # %bb.0: 81; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 82; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 83; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 84; AVX2-NEXT: vmovdqa (%rdi), %xmm0 85; AVX2-NEXT: vmovdqa (%rdx), %xmm1 86; AVX2-NEXT: vmovdqa (%r8), %xmm2 87; AVX2-NEXT: vmovdqa (%r11), %xmm3 88; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 89; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 90; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 91; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 92; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 93; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 94; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 95; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] 96; AVX2-NEXT: vmovdqa %xmm0, (%rax) 97; AVX2-NEXT: retq 98; 99; AVX2-FP-LABEL: store_i8_stride8_vf2: 100; AVX2-FP: # %bb.0: 101; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 102; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 103; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 104; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 105; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 106; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 107; AVX2-FP-NEXT: vmovdqa (%r11), %xmm3 108; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 109; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 110; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 111; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 112; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 113; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 114; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 115; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] 116; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax) 117; AVX2-FP-NEXT: retq 118; 119; AVX2-FCP-LABEL: store_i8_stride8_vf2: 120; AVX2-FCP: # %bb.0: 121; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 122; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 123; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 124; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 125; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 126; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 127; AVX2-FCP-NEXT: vmovdqa (%r11), %xmm3 128; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 129; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 130; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 131; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 132; AVX2-FCP-NEXT: 
vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 133; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 134; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 135; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] 136; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) 137; AVX2-FCP-NEXT: retq 138; 139; AVX512-LABEL: store_i8_stride8_vf2: 140; AVX512: # %bb.0: 141; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 142; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 143; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 144; AVX512-NEXT: vmovdqa (%rdi), %xmm0 145; AVX512-NEXT: vmovdqa (%rdx), %xmm1 146; AVX512-NEXT: vmovdqa (%r8), %xmm2 147; AVX512-NEXT: vmovdqa (%r11), %xmm3 148; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 149; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 150; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 151; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 152; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 153; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 154; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 155; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] 156; AVX512-NEXT: vmovdqa %xmm0, (%rax) 157; AVX512-NEXT: retq 158; 159; AVX512-FCP-LABEL: store_i8_stride8_vf2: 160; AVX512-FCP: # %bb.0: 161; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 162; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 163; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 164; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 165; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 166; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 167; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3 168; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 169; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 170; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 171; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 172; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 173; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 174; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 175; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] 176; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax) 177; AVX512-FCP-NEXT: retq 178; 179; AVX512DQ-LABEL: store_i8_stride8_vf2: 180; AVX512DQ: # %bb.0: 181; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 182; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 183; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 184; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 185; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 186; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 187; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3 188; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 189; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 190; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 191; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 192; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 193; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 194; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 195; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] 196; AVX512DQ-NEXT: vmovdqa %xmm0, (%rax) 197; AVX512DQ-NEXT: retq 198; 199; AVX512DQ-FCP-LABEL: store_i8_stride8_vf2: 200; AVX512DQ-FCP: # %bb.0: 201; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 202; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 203; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 204; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 205; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 206; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 207; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3 208; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 209; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 210; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 211; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 212; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 213; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 214; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 215; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] 216; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax) 217; AVX512DQ-FCP-NEXT: retq 218; 219; AVX512BW-LABEL: store_i8_stride8_vf2: 220; AVX512BW: # %bb.0: 221; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 222; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 223; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 224; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 225; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 226; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 227; AVX512BW-NEXT: vmovdqa (%r11), %xmm3 228; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 229; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 230; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 231; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 232; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 233; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 234; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 235; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] 236; AVX512BW-NEXT: vmovdqa %xmm0, (%rax) 237; AVX512BW-NEXT: retq 238; 239; 
AVX512BW-FCP-LABEL: store_i8_stride8_vf2: 240; AVX512BW-FCP: # %bb.0: 241; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 242; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 243; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 244; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 245; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 246; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 247; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3 248; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 249; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 250; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 251; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 252; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 253; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 254; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 255; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] 256; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rax) 257; AVX512BW-FCP-NEXT: retq 258; 259; AVX512DQ-BW-LABEL: store_i8_stride8_vf2: 260; AVX512DQ-BW: # %bb.0: 261; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 262; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 263; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 264; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 265; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 266; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 267; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3 268; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 269; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 270; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 271; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 272; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 273; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 274; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 275; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] 276; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rax) 277; AVX512DQ-BW-NEXT: retq 278; 279; AVX512DQ-BW-FCP-LABEL: store_i8_stride8_vf2: 280; AVX512DQ-BW-FCP: # %bb.0: 281; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 282; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 283; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 284; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 285; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 286; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 287; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3 288; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 289; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 290; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 291; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 292; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] 293; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 294; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 295; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] 296; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rax) 297; AVX512DQ-BW-FCP-NEXT: retq 298 %in.vec0 = load <2 x i8>, ptr %in.vecptr0, align 64 299 %in.vec1 = load <2 x i8>, ptr %in.vecptr1, align 64 300 %in.vec2 = load <2 x i8>, ptr %in.vecptr2, align 64 301 %in.vec3 = load <2 x i8>, ptr %in.vecptr3, align 64 302 %in.vec4 = load <2 x i8>, ptr %in.vecptr4, align 64 303 %in.vec5 = load <2 x i8>, ptr %in.vecptr5, align 64 304 %in.vec6 = load <2 x i8>, ptr %in.vecptr6, align 64 305 %in.vec7 = load <2 x i8>, ptr %in.vecptr7, align 64 306 %1 = shufflevector <2 x i8> %in.vec0, <2 x i8> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 307 %2 = shufflevector <2 x i8> %in.vec2, <2 x i8> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 308 %3 = shufflevector <2 x i8> %in.vec4, <2 x i8> %in.vec5, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 309 %4 = shufflevector <2 x i8> %in.vec6, <2 x i8> %in.vec7, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 310 %5 = shufflevector <4 x i8> %1, <4 x i8> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 311 %6 = shufflevector <4 x i8> %3, <4 x i8> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 312 %7 = shufflevector <8 x i8> %5, <8 x i8> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 313 %interleaved.vec = shufflevector <16 x i8> %7, <16 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 314 store <16 x i8> %interleaved.vec, ptr %out.vec, align 64 315 ret void 316} 317 318define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { 319; SSE-LABEL: store_i8_stride8_vf4: 320; SSE: # %bb.0: 321; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 322; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 323; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 324; SSE-NEXT: movdqa (%rdi), %xmm0 325; SSE-NEXT: movdqa (%rdx), %xmm1 326; SSE-NEXT: movdqa (%r8), %xmm2 327; SSE-NEXT: movdqa (%r11), %xmm3 328; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 329; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 330; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] 331; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] 332; SSE-NEXT: pxor %xmm6, %xmm6 333; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] 334; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] 335; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,7,5] 336; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,5,6,4] 337; SSE-NEXT: packuswb %xmm5, %xmm7 338; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,65535,0] 339; SSE-NEXT: movdqa %xmm4, %xmm5 340; SSE-NEXT: pandn %xmm7, %xmm5 341; SSE-NEXT: 
punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 342; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,2,0] 343; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,7,5,6,7] 344; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] 345; SSE-NEXT: packuswb %xmm8, %xmm7 346; SSE-NEXT: pand %xmm4, %xmm7 347; SSE-NEXT: por %xmm5, %xmm7 348; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] 349; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 350; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] 351; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,1,1,3,4,5,6,7] 352; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,1,0,2,4,5,6,7] 353; SSE-NEXT: packuswb %xmm8, %xmm9 354; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,0,65535,65535] 355; SSE-NEXT: movdqa %xmm5, %xmm8 356; SSE-NEXT: pandn %xmm9, %xmm8 357; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 358; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] 359; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7] 360; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] 361; SSE-NEXT: packuswb %xmm9, %xmm6 362; SSE-NEXT: pand %xmm5, %xmm6 363; SSE-NEXT: por %xmm8, %xmm6 364; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 365; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 366; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] 367; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,5,7] 368; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] 369; SSE-NEXT: packuswb %xmm7, %xmm3 370; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] 371; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,5,7,6,7] 372; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 373; SSE-NEXT: packuswb %xmm7, %xmm2 374; SSE-NEXT: pand %xmm4, %xmm2 375; SSE-NEXT: pandn %xmm3, %xmm4 376; SSE-NEXT: por %xmm2, %xmm4 377; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] 378; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] 379; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,3,1,4,5,6,7] 380; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] 381; SSE-NEXT: packuswb %xmm3, %xmm1 382; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] 383; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,1,2,3,4,5,6,7] 384; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] 385; SSE-NEXT: packuswb %xmm3, %xmm0 386; SSE-NEXT: pand %xmm5, %xmm0 387; SSE-NEXT: pandn %xmm1, %xmm5 388; SSE-NEXT: por %xmm0, %xmm5 389; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] 390; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 391; SSE-NEXT: movdqa %xmm0, 16(%rax) 392; SSE-NEXT: movdqa %xmm6, (%rax) 393; SSE-NEXT: retq 394; 395; AVX-LABEL: store_i8_stride8_vf4: 396; AVX: # %bb.0: 397; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 398; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 399; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11 400; AVX-NEXT: vmovdqa (%rdi), %xmm0 401; AVX-NEXT: vmovdqa (%rdx), %xmm1 402; AVX-NEXT: vmovdqa (%r8), %xmm2 403; AVX-NEXT: vmovdqa (%r11), %xmm3 404; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 405; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 406; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 407; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] 408; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = 
xmm3[0],mem[0],xmm3[1],mem[1] 409; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 410; AVX-NEXT: vmovq {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,0,0,0,0,0,0,0,0] 411; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3 412; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 413; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 414; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,0,0,0,0,0,0,0] 415; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 416; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 417; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 418; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 419; AVX-NEXT: vmovaps %ymm0, (%rax) 420; AVX-NEXT: vzeroupper 421; AVX-NEXT: retq 422; 423; AVX2-LABEL: store_i8_stride8_vf4: 424; AVX2: # %bb.0: 425; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 426; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 427; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 428; AVX2-NEXT: vmovdqa (%rdi), %xmm0 429; AVX2-NEXT: vmovdqa (%rsi), %xmm1 430; AVX2-NEXT: vmovdqa (%rdx), %xmm2 431; AVX2-NEXT: vmovdqa (%rcx), %xmm3 432; AVX2-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 433; AVX2-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 434; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 435; AVX2-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 436; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 437; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 438; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 439; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] 440; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] 441; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 442; AVX2-NEXT: vmovdqa %ymm0, (%rax) 443; AVX2-NEXT: vzeroupper 444; AVX2-NEXT: retq 445; 446; AVX2-FP-LABEL: store_i8_stride8_vf4: 447; AVX2-FP: # %bb.0: 448; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 449; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 450; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 451; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 452; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 453; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 454; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm3 455; AVX2-FP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 456; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 457; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 458; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 459; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 460; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 461; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 462; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] 463; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] 464; AVX2-FP-NEXT: vpermd %ymm0, %ymm1, %ymm0 465; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax) 466; AVX2-FP-NEXT: vzeroupper 467; AVX2-FP-NEXT: retq 468; 469; AVX2-FCP-LABEL: store_i8_stride8_vf4: 470; AVX2-FCP: # %bb.0: 471; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 472; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 473; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 474; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 475; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 476; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 477; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm3 478; AVX2-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 479; 
AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 480; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 481; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 482; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 483; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 484; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 485; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] 486; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] 487; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 488; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) 489; AVX2-FCP-NEXT: vzeroupper 490; AVX2-FCP-NEXT: retq 491; 492; AVX512-LABEL: store_i8_stride8_vf4: 493; AVX512: # %bb.0: 494; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 495; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 496; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 497; AVX512-NEXT: vmovdqa (%rdi), %xmm0 498; AVX512-NEXT: vmovdqa (%rsi), %xmm1 499; AVX512-NEXT: vmovdqa (%rdx), %xmm2 500; AVX512-NEXT: vmovdqa (%rcx), %xmm3 501; AVX512-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 502; AVX512-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 503; AVX512-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 504; AVX512-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 505; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 506; AVX512-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 507; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 508; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] 509; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] 510; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 511; AVX512-NEXT: vmovdqa %ymm0, (%rax) 512; AVX512-NEXT: vzeroupper 513; AVX512-NEXT: retq 514; 515; AVX512-FCP-LABEL: store_i8_stride8_vf4: 516; AVX512-FCP: # %bb.0: 517; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 518; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 519; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 520; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 521; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 522; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 523; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3 524; AVX512-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 525; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 526; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 527; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 528; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 529; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 530; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 531; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] 532; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] 533; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 534; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) 535; AVX512-FCP-NEXT: vzeroupper 536; AVX512-FCP-NEXT: retq 537; 538; AVX512DQ-LABEL: store_i8_stride8_vf4: 539; AVX512DQ: # %bb.0: 540; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 541; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 542; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 543; 
AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 544; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 545; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 546; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 547; AVX512DQ-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 548; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 549; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 550; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 551; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 552; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 553; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 554; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] 555; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] 556; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 557; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax) 558; AVX512DQ-NEXT: vzeroupper 559; AVX512DQ-NEXT: retq 560; 561; AVX512DQ-FCP-LABEL: store_i8_stride8_vf4: 562; AVX512DQ-FCP: # %bb.0: 563; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 564; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 565; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 566; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 567; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 568; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 569; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3 570; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 571; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 572; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 573; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 574; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 575; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 576; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 577; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] 578; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] 579; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 580; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) 581; AVX512DQ-FCP-NEXT: vzeroupper 582; AVX512DQ-FCP-NEXT: retq 583; 584; AVX512BW-LABEL: store_i8_stride8_vf4: 585; AVX512BW: # %bb.0: 586; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 587; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 588; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 589; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 590; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 591; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 592; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3 593; AVX512BW-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 594; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 595; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 596; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 597; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 598; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 599; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 600; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] 601; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] 602; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 603; 
AVX512BW-NEXT: vmovdqa %ymm0, (%rax) 604; AVX512BW-NEXT: vzeroupper 605; AVX512BW-NEXT: retq 606; 607; AVX512BW-FCP-LABEL: store_i8_stride8_vf4: 608; AVX512BW-FCP: # %bb.0: 609; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 610; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 611; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 612; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 613; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 614; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 615; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 616; AVX512BW-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 617; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 618; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 619; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 620; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 621; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 622; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 623; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] 624; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] 625; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 626; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) 627; AVX512BW-FCP-NEXT: vzeroupper 628; AVX512BW-FCP-NEXT: retq 629; 630; AVX512DQ-BW-LABEL: store_i8_stride8_vf4: 631; AVX512DQ-BW: # %bb.0: 632; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 633; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 634; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 635; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 636; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1 637; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2 638; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3 639; AVX512DQ-BW-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 640; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 641; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 642; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 643; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 644; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 645; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 646; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] 647; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] 648; AVX512DQ-BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 649; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) 650; AVX512DQ-BW-NEXT: vzeroupper 651; AVX512DQ-BW-NEXT: retq 652; 653; AVX512DQ-BW-FCP-LABEL: store_i8_stride8_vf4: 654; AVX512DQ-BW-FCP: # %bb.0: 655; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 656; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 657; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 658; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 659; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 660; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 661; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 662; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 663; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 664; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] 665; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 666; AVX512DQ-BW-FCP-NEXT: 
vinserti128 $1, (%r8), %ymm0, %ymm0 667; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 668; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] 669; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] 670; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] 671; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 672; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) 673; AVX512DQ-BW-FCP-NEXT: vzeroupper 674; AVX512DQ-BW-FCP-NEXT: retq 675 %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64 676 %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64 677 %in.vec2 = load <4 x i8>, ptr %in.vecptr2, align 64 678 %in.vec3 = load <4 x i8>, ptr %in.vecptr3, align 64 679 %in.vec4 = load <4 x i8>, ptr %in.vecptr4, align 64 680 %in.vec5 = load <4 x i8>, ptr %in.vecptr5, align 64 681 %in.vec6 = load <4 x i8>, ptr %in.vecptr6, align 64 682 %in.vec7 = load <4 x i8>, ptr %in.vecptr7, align 64 683 %1 = shufflevector <4 x i8> %in.vec0, <4 x i8> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 684 %2 = shufflevector <4 x i8> %in.vec2, <4 x i8> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 685 %3 = shufflevector <4 x i8> %in.vec4, <4 x i8> %in.vec5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 686 %4 = shufflevector <4 x i8> %in.vec6, <4 x i8> %in.vec7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 687 %5 = shufflevector <8 x i8> %1, <8 x i8> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 688 %6 = shufflevector <8 x i8> %3, <8 x i8> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 689 %7 = shufflevector <16 x i8> %5, <16 x i8> %6, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 690 %interleaved.vec = shufflevector <32 x i8> %7, <32 x i8> poison, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31> 691 store <32 x i8> %interleaved.vec, ptr %out.vec, align 64 692 ret void 693} 694 695define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { 696; SSE-LABEL: store_i8_stride8_vf8: 697; SSE: # %bb.0: 698; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 699; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 700; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 701; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 702; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 703; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 704; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 705; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero 706; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 707; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero 708; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero 709; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 710; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero 711; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero 712; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 713; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,2,1,4,5,6,7] 714; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,1] 715; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,0] 716; SSE-NEXT: movdqa %xmm3, %xmm6 717; SSE-NEXT: pandn %xmm4, %xmm6 718; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] 719; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] 720; SSE-NEXT: pand %xmm3, %xmm4 721; SSE-NEXT: por %xmm6, %xmm4 722; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,3,2,3] 723; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] 724; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,1,3] 725; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,0,65535,65535] 726; SSE-NEXT: movdqa %xmm4, %xmm8 727; SSE-NEXT: pandn %xmm6, %xmm8 728; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] 729; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] 730; SSE-NEXT: pand %xmm4, %xmm6 731; SSE-NEXT: por %xmm8, %xmm6 732; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 733; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 734; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] 735; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] 736; SSE-NEXT: movdqa %xmm3, %xmm8 737; SSE-NEXT: pandn %xmm7, %xmm8 738; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] 739; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] 740; SSE-NEXT: pand %xmm3, %xmm7 741; SSE-NEXT: por %xmm8, %xmm7 742; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,3,2,3] 743; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7] 744; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] 745; SSE-NEXT: movdqa %xmm4, %xmm9 746; SSE-NEXT: pandn %xmm7, %xmm9 747; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 748; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 749; SSE-NEXT: pand %xmm4, %xmm7 750; SSE-NEXT: por %xmm9, %xmm7 751; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 752; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 753; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,4,6,5] 754; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 755; SSE-NEXT: movdqa %xmm3, %xmm9 756; SSE-NEXT: pandn %xmm8, %xmm9 757; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,7] 758; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 759; SSE-NEXT: pand %xmm3, %xmm8 760; SSE-NEXT: por %xmm9, %xmm8 761; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,3,2,3] 762; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,4,6,5] 763; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3] 764; SSE-NEXT: movdqa %xmm4, %xmm10 765; SSE-NEXT: pandn %xmm9, %xmm10 766; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] 767; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] 768; SSE-NEXT: pand %xmm4, %xmm9 769; SSE-NEXT: por %xmm10, %xmm9 770; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] 771; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] 772; SSE-NEXT: pshufhw {{.*#+}} xmm5 = 
xmm5[0,1,2,3,4,6,6,7] 773; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 774; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] 775; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 776; SSE-NEXT: pand %xmm3, %xmm2 777; SSE-NEXT: pandn %xmm5, %xmm3 778; SSE-NEXT: por %xmm2, %xmm3 779; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] 780; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 781; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] 782; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 783; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 784; SSE-NEXT: pand %xmm4, %xmm0 785; SSE-NEXT: pandn %xmm1, %xmm4 786; SSE-NEXT: por %xmm0, %xmm4 787; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] 788; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 789; SSE-NEXT: movdqa %xmm0, 48(%rax) 790; SSE-NEXT: movdqa %xmm9, 32(%rax) 791; SSE-NEXT: movdqa %xmm7, 16(%rax) 792; SSE-NEXT: movdqa %xmm6, (%rax) 793; SSE-NEXT: retq 794; 795; AVX-LABEL: store_i8_stride8_vf8: 796; AVX: # %bb.0: 797; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 798; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 799; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11 800; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 801; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 802; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 803; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 804; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 805; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 806; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 807; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 808; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 809; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 810; AVX-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 811; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 812; AVX-NEXT: vmovq {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,0,0,0,0,0,0] 813; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 814; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 815; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 816; AVX-NEXT: vmovq {{.*#+}} xmm5 = [2,10,0,0,3,11,0,0,0,0,0,0,0,0,0,0] 817; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 818; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 819; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 820; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] 821; AVX-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,0,0,0,0,0] 822; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm6 823; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm5 824; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 825; AVX-NEXT: vmovq {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,0,0,0,0,0,0,0] 826; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7 827; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm6 828; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] 829; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] 830; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 831; AVX-NEXT: vmovq {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,0,0,0,0,0,0] 832; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm6 833; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm5 834; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 835; AVX-NEXT: vmovq {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,0,0,0,0,0,0,0,0] 836; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7 837; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm6 838; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] 839; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] 840; AVX-NEXT: vmovq {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,0,0,0,0,0,0] 841; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm3 842; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm2 843; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 844; AVX-NEXT: vmovq {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,0,0,0,0,0,0,0,0] 845; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 846; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 847; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 848; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 849; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 850; AVX-NEXT: vmovaps %ymm0, 32(%rax) 851; AVX-NEXT: vmovaps %ymm4, (%rax) 852; AVX-NEXT: vzeroupper 853; AVX-NEXT: retq 854; 855; AVX2-LABEL: store_i8_stride8_vf8: 856; AVX2: # %bb.0: 857; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 858; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 859; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 860; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 861; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 862; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 863; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 864; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 865; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 866; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 867; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 868; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 869; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 870; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 871; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 872; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 873; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 874; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,zero,ymm1[19,27] 875; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] 876; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[18,26],zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero 877; AVX2-NEXT: vpor %ymm4, %ymm2, %ymm2 878; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero 879; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] 880; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero 881; AVX2-NEXT: vpor %ymm6, %ymm4, %ymm4 882; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 883; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31] 884; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero 885; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 886; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero 887; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero 888; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0 889; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 890; AVX2-NEXT: vmovdqa %ymm0, 32(%rax) 891; AVX2-NEXT: vmovdqa %ymm2, (%rax) 892; AVX2-NEXT: vzeroupper 893; AVX2-NEXT: retq 894; 895; AVX2-FP-LABEL: store_i8_stride8_vf8: 896; AVX2-FP: # %bb.0: 897; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 898; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 899; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 900; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 901; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 902; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 903; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 904; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 905; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 906; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 907; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 908; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 909; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 910; AVX2-FP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 911; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 912; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 913; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 914; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,zero,ymm1[19,27] 915; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] 916; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[18,26],zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero 917; AVX2-FP-NEXT: vpor %ymm4, %ymm2, %ymm2 918; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero 919; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] 920; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero 921; AVX2-FP-NEXT: vpor %ymm6, %ymm4, %ymm4 922; AVX2-FP-NEXT: vpor %ymm2, %ymm4, %ymm2 923; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31] 924; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero 925; AVX2-FP-NEXT: vpor %ymm3, %ymm1, %ymm1 926; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero 927; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero 928; AVX2-FP-NEXT: vpor 
%ymm3, %ymm0, %ymm0 929; AVX2-FP-NEXT: vpor %ymm1, %ymm0, %ymm0 930; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax) 931; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax) 932; AVX2-FP-NEXT: vzeroupper 933; AVX2-FP-NEXT: retq 934; 935; AVX2-FCP-LABEL: store_i8_stride8_vf8: 936; AVX2-FCP: # %bb.0: 937; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 938; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 939; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 940; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 941; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 942; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 943; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 944; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 945; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 946; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 947; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 948; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 949; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 950; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 951; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 952; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 953; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 954; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] 955; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] 956; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3 957; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15] 958; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 959; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 960; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [201851904,218694913,235537922,252380931] 961; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 962; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] 963; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] 964; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] 965; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 966; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 967; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 968; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 969; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 970; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rax) 971; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax) 972; AVX2-FCP-NEXT: vzeroupper 973; AVX2-FCP-NEXT: retq 974; 975; AVX512-LABEL: store_i8_stride8_vf8: 976; AVX512: # %bb.0: 977; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 978; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 979; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 980; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 981; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 982; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 983; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 984; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 985; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 986; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 987; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 988; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 989; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 990; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 991; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 992; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 993; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 994; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31] 995; AVX512-NEXT: vpshufb %ymm2, %ymm1, 
%ymm3 996; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm2 997; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] 998; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27] 999; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm4 1000; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3 1001; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] 1002; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 1003; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] 1004; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128] 1005; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm4 1006; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 1007; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3 1008; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] 1009; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128] 1010; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1011; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1012; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 1013; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 1014; AVX512-NEXT: vpord %zmm0, %zmm2, %zmm0 1015; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) 1016; AVX512-NEXT: vzeroupper 1017; AVX512-NEXT: retq 1018; 1019; AVX512-FCP-LABEL: store_i8_stride8_vf8: 1020; AVX512-FCP: # %bb.0: 1021; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1022; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1023; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 1024; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1025; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1026; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1027; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1028; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1029; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1030; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1031; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1032; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1033; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1034; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1035; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 1036; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1037; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 1038; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] 1039; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] 1040; AVX512-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3 1041; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15] 1042; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1043; AVX512-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 1044; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [201851904,218694913,235537922,252380931] 1045; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 1046; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] 1047; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] 1048; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] 1049; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 1050; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1051; AVX512-FCP-NEXT: vpermd %ymm0, 
%ymm3, %ymm0 1052; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 1053; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 1054; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 1055; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 1056; AVX512-FCP-NEXT: vzeroupper 1057; AVX512-FCP-NEXT: retq 1058; 1059; AVX512DQ-LABEL: store_i8_stride8_vf8: 1060; AVX512DQ: # %bb.0: 1061; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 1062; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 1063; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 1064; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1065; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1066; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1067; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1068; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1069; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1070; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1071; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1072; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1073; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1074; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1075; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 1076; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1077; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 1078; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31] 1079; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm3 1080; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm2 1081; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] 1082; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27] 1083; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm4 1084; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm3 1085; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] 1086; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 1087; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] 1088; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128] 1089; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm4 1090; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 1091; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm3 1092; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] 1093; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128] 1094; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1095; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1096; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 1097; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 1098; AVX512DQ-NEXT: vpord %zmm0, %zmm2, %zmm0 1099; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) 1100; AVX512DQ-NEXT: vzeroupper 1101; AVX512DQ-NEXT: retq 1102; 1103; AVX512DQ-FCP-LABEL: store_i8_stride8_vf8: 1104; AVX512DQ-FCP: # %bb.0: 1105; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1106; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1107; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 1108; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1109; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = 
mem[0],zero 1110; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1111; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1112; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1113; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1114; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1115; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1116; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1117; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1118; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1119; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 1120; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1121; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 1122; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] 1123; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] 1124; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3 1125; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15] 1126; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1127; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 1128; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [201851904,218694913,235537922,252380931] 1129; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 1130; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] 1131; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] 1132; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] 1133; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 1134; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1135; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 1136; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 1137; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 1138; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 1139; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 1140; AVX512DQ-FCP-NEXT: vzeroupper 1141; AVX512DQ-FCP-NEXT: retq 1142; 1143; AVX512BW-LABEL: store_i8_stride8_vf8: 1144; AVX512BW: # %bb.0: 1145; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1146; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 1147; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 1148; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1149; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1150; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1151; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1152; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1153; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1154; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1155; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1156; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1157; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1158; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1159; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 1160; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1161; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1162; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 1163; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 1164; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] 1165; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] 1166; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] 1167; AVX512BW-NEXT: movl $287445282, %ecx # imm = 0x11221122 1168; AVX512BW-NEXT: kmovd %ecx, %k1 1169; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} 1170; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,4,5,6,7] 1171; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] 1172; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1] 1173; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] 1174; AVX512BW-NEXT: movl $1149781128, %ecx # imm = 0x44884488 1175; AVX512BW-NEXT: kmovd %ecx, %k1 1176; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} 1177; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA 1178; AVX512BW-NEXT: kmovd %ecx, %k1 1179; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 1180; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) 1181; AVX512BW-NEXT: vzeroupper 1182; AVX512BW-NEXT: retq 1183; 1184; AVX512BW-FCP-LABEL: store_i8_stride8_vf8: 1185; AVX512BW-FCP: # %bb.0: 1186; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1187; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1188; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 1189; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1190; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1191; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1192; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1193; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1194; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1195; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1196; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1197; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1198; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1199; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1200; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 1201; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1202; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 1203; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 1204; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15] 1205; AVX512BW-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm1 1206; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63] 1207; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7] 1208; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 1209; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63,u,u,u,u] 1210; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA 1211; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 1212; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} 1213; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 1214; AVX512BW-FCP-NEXT: vzeroupper 1215; AVX512BW-FCP-NEXT: retq 1216; 1217; AVX512DQ-BW-LABEL: store_i8_stride8_vf8: 1218; AVX512DQ-BW: # %bb.0: 1219; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1220; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 1221; 
AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 1222; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1223; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1224; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1225; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1226; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1227; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1228; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1229; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1230; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1231; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1232; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1233; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 1234; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1235; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1236; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 1237; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 1238; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] 1239; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] 1240; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] 1241; AVX512DQ-BW-NEXT: movl $287445282, %ecx # imm = 0x11221122 1242; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 1243; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} 1244; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,4,5,6,7] 1245; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] 1246; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1] 1247; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] 1248; AVX512DQ-BW-NEXT: movl $1149781128, %ecx # imm = 0x44884488 1249; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 1250; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} 1251; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA 1252; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 1253; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 1254; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) 1255; AVX512DQ-BW-NEXT: vzeroupper 1256; AVX512DQ-BW-NEXT: retq 1257; 1258; AVX512DQ-BW-FCP-LABEL: store_i8_stride8_vf8: 1259; AVX512DQ-BW-FCP: # %bb.0: 1260; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1261; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1262; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 1263; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1264; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1265; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1266; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1267; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1268; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1269; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1270; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1271; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1272; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1273; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = 
mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15]
; AVX512DQ-BW-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7]
; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
  %in.vec1 = load <8 x i8>, ptr %in.vecptr1, align 64
  %in.vec2 = load <8 x i8>, ptr %in.vecptr2, align 64
  %in.vec3 = load <8 x i8>, ptr %in.vecptr3, align 64
  %in.vec4 = load <8 x i8>, ptr %in.vecptr4, align 64
  %in.vec5 = load <8 x i8>, ptr %in.vecptr5, align 64
  %in.vec6 = load <8 x i8>, ptr %in.vecptr6, align 64
  %in.vec7 = load <8 x i8>, ptr %in.vecptr7, align 64
  %1 = shufflevector <8 x i8> %in.vec0, <8 x i8> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = shufflevector <8 x i8> %in.vec2, <8 x i8> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %3 = shufflevector <8 x i8> %in.vec4, <8 x i8> %in.vec5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %4 = shufflevector <8 x i8> %in.vec6, <8 x i8> %in.vec7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %5 = shufflevector <16 x i8> %1, <16 x i8> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %6 = shufflevector <16 x i8> %3, <16 x i8> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %7 = shufflevector <32 x i8> %5, <32 x i8> %6, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %interleaved.vec = shufflevector <64 x i8> %7, <64 x i8> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
  store <64 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride8_vf16:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movdqa (%rdi), %xmm10
; SSE-NEXT: movdqa (%rsi), %xmm9
; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa (%rdx), %xmm1
; SSE-NEXT: movdqa (%rcx), %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa (%r8), %xmm3
; SSE-NEXT: movdqa (%r9), %xmm11
; SSE-NEXT: movdqa (%r10), %xmm4
; SSE-NEXT: movdqa (%rax), %xmm13
; SSE-NEXT: movdqa %xmm4, %xmm12
; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,0,2,1,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,0]
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: pandn %xmm0, %xmm5
; SSE-NEXT: movdqa %xmm3, %xmm14
; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,1,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,1]
; SSE-NEXT: pand %xmm2, %xmm7
; SSE-NEXT: por %xmm5, %xmm7
; SSE-NEXT: movdqa %xmm1, %xmm15
; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3],xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: pandn %xmm5, %xmm8
; SSE-NEXT: movdqa %xmm10, %xmm6
; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,0,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5]
; SSE-NEXT: pand %xmm0, %xmm9
; SSE-NEXT: por %xmm8, %xmm9
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
1354; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1355; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7] 1356; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] 1357; SSE-NEXT: movdqa %xmm2, %xmm8 1358; SSE-NEXT: pandn %xmm7, %xmm8 1359; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[2,1,3,3,4,5,6,7] 1360; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] 1361; SSE-NEXT: pand %xmm2, %xmm7 1362; SSE-NEXT: por %xmm8, %xmm7 1363; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm15[0,2,2,3,4,5,6,7] 1364; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] 1365; SSE-NEXT: movdqa %xmm0, %xmm9 1366; SSE-NEXT: pandn %xmm8, %xmm9 1367; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,1,1] 1368; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] 1369; SSE-NEXT: pand %xmm0, %xmm8 1370; SSE-NEXT: por %xmm9, %xmm8 1371; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,3,2,3] 1372; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] 1373; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 1374; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,6,6,7] 1375; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 1376; SSE-NEXT: movdqa %xmm2, %xmm9 1377; SSE-NEXT: pandn %xmm8, %xmm9 1378; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,6,5,7,7] 1379; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 1380; SSE-NEXT: pand %xmm2, %xmm8 1381; SSE-NEXT: por %xmm9, %xmm8 1382; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] 1383; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3] 1384; SSE-NEXT: movdqa %xmm0, %xmm5 1385; SSE-NEXT: pandn %xmm9, %xmm5 1386; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[3,3,3,3] 1387; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] 1388; SSE-NEXT: pand %xmm0, %xmm9 1389; SSE-NEXT: por %xmm5, %xmm9 1390; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,3,2,3] 1391; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] 1392; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] 1393; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,4,6,5] 1394; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 1395; SSE-NEXT: movdqa %xmm2, %xmm9 1396; SSE-NEXT: pandn %xmm5, %xmm9 1397; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,4,5,5,7] 1398; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 1399; SSE-NEXT: pand %xmm2, %xmm5 1400; SSE-NEXT: por %xmm9, %xmm5 1401; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,4,6,5] 1402; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3] 1403; SSE-NEXT: movdqa %xmm0, %xmm12 1404; SSE-NEXT: pandn %xmm9, %xmm12 1405; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] 1406; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] 1407; SSE-NEXT: pand %xmm0, %xmm6 1408; SSE-NEXT: por %xmm12, %xmm6 1409; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] 1410; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,2,2,3] 1411; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] 1412; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15] 1413; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] 1414; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] 1415; SSE-NEXT: movdqa %xmm2, %xmm6 1416; SSE-NEXT: pandn %xmm5, %xmm6 1417; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] 1418; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] 1419; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] 1420; SSE-NEXT: pand %xmm2, %xmm5 
1421; SSE-NEXT: por %xmm6, %xmm5 1422; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] 1423; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 1424; SSE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] 1425; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,0,2,1,4,5,6,7] 1426; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,3] 1427; SSE-NEXT: movdqa %xmm0, %xmm9 1428; SSE-NEXT: pandn %xmm6, %xmm9 1429; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 1430; SSE-NEXT: # xmm10 = xmm10[8],mem[8],xmm10[9],mem[9],xmm10[10],mem[10],xmm10[11],mem[11],xmm10[12],mem[12],xmm10[13],mem[13],xmm10[14],mem[14],xmm10[15],mem[15] 1431; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,0,0,0] 1432; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] 1433; SSE-NEXT: pand %xmm0, %xmm6 1434; SSE-NEXT: por %xmm9, %xmm6 1435; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,2,2,3] 1436; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] 1437; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7] 1438; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] 1439; SSE-NEXT: movdqa %xmm2, %xmm6 1440; SSE-NEXT: pandn %xmm5, %xmm6 1441; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] 1442; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] 1443; SSE-NEXT: pand %xmm2, %xmm5 1444; SSE-NEXT: por %xmm6, %xmm5 1445; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,3,2,3] 1446; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] 1447; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3] 1448; SSE-NEXT: movdqa %xmm0, %xmm11 1449; SSE-NEXT: pandn %xmm5, %xmm11 1450; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,1,1] 1451; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] 1452; SSE-NEXT: pand %xmm0, %xmm5 1453; SSE-NEXT: por %xmm11, %xmm5 1454; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 1455; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 1456; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,6,6,7] 1457; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 1458; SSE-NEXT: movdqa %xmm2, %xmm11 1459; SSE-NEXT: pandn %xmm6, %xmm11 1460; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,6,5,7,7] 1461; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 1462; SSE-NEXT: pand %xmm2, %xmm6 1463; SSE-NEXT: por %xmm11, %xmm6 1464; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,4,6,6,7] 1465; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,3,3] 1466; SSE-NEXT: movdqa %xmm0, %xmm13 1467; SSE-NEXT: pandn %xmm11, %xmm13 1468; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[3,3,3,3] 1469; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] 1470; SSE-NEXT: pand %xmm0, %xmm11 1471; SSE-NEXT: por %xmm13, %xmm11 1472; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] 1473; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] 1474; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] 1475; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,5] 1476; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1477; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] 1478; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 1479; SSE-NEXT: pand %xmm2, %xmm3 1480; SSE-NEXT: pandn %xmm4, %xmm2 1481; SSE-NEXT: por %xmm3, %xmm2 1482; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5] 1483; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] 1484; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] 1485; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] 1486; SSE-NEXT: pand %xmm0, %xmm3 1487; SSE-NEXT: pandn %xmm1, 
%xmm0 1488; SSE-NEXT: por %xmm3, %xmm0 1489; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 1490; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1491; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1492; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1493; SSE-NEXT: movdqa %xmm0, 96(%rax) 1494; SSE-NEXT: movdqa %xmm11, 112(%rax) 1495; SSE-NEXT: movdqa %xmm5, 80(%rax) 1496; SSE-NEXT: movdqa %xmm9, 64(%rax) 1497; SSE-NEXT: movdqa %xmm12, 32(%rax) 1498; SSE-NEXT: movdqa %xmm8, 48(%rax) 1499; SSE-NEXT: movdqa %xmm7, 16(%rax) 1500; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1501; SSE-NEXT: movaps %xmm0, (%rax) 1502; SSE-NEXT: retq 1503; 1504; AVX-LABEL: store_i8_stride8_vf16: 1505; AVX: # %bb.0: 1506; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 1507; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 1508; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11 1509; AVX-NEXT: vmovdqa (%rdi), %xmm1 1510; AVX-NEXT: vmovdqa (%rsi), %xmm2 1511; AVX-NEXT: vmovdqa (%rdx), %xmm3 1512; AVX-NEXT: vmovdqa (%rcx), %xmm4 1513; AVX-NEXT: vmovdqa (%r8), %xmm5 1514; AVX-NEXT: vmovdqa (%r9), %xmm6 1515; AVX-NEXT: vmovdqa (%r11), %xmm8 1516; AVX-NEXT: vmovdqa (%r10), %xmm9 1517; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] 1518; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] 1519; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 1520; AVX-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] 1521; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7] 1522; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,0,2,1] 1523; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3],xmm11[4,5,6],xmm0[7] 1524; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,0,2,1,4,5,6,7] 1525; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,0,2,1] 1526; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[0,1,1,3,4,5,6,7] 1527; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,2,1] 1528; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3],xmm12[4,5,6],xmm11[7] 1529; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 1530; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1531; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,2,2,3,4,5,6,7] 1532; AVX-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero 1533; AVX-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1534; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,1,1] 1535; AVX-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero 1536; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3,4],xmm12[5],xmm14[6,7] 1537; AVX-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero 1538; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm11[0,0,2,1,4,5,6,7] 1539; AVX-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero 1540; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4],xmm15[5],xmm14[6,7] 1541; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 1542; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2],ymm0[3],ymm12[4],ymm0[5],ymm12[6],ymm0[7] 1543; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,6,6,7] 1544; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] 1545; AVX-NEXT: 
vpshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,6,5,7,7] 1546; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] 1547; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3],xmm14[4,5,6],xmm12[7] 1548; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,6,5] 1549; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 1550; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,7] 1551; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] 1552; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3],xmm10[4,5,6],xmm7[7] 1553; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 1554; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7] 1555; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] 1556; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3] 1557; AVX-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 1558; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4],xmm10[5],xmm12[6,7] 1559; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,6,5] 1560; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,3,3] 1561; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] 1562; AVX-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 1563; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4],xmm11[5],xmm12[6,7] 1564; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 1565; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7] 1566; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] 1567; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,2,2,3,4,5,6,7] 1568; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] 1569; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] 1570; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7] 1571; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 1572; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3],xmm6[4,5,6],xmm9[7] 1573; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,0,2,1,4,5,6,7] 1574; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] 1575; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,1,1,3,4,5,6,7] 1576; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] 1577; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5,6],xmm9[7] 1578; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm6 1579; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] 1580; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7] 1581; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 1582; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1583; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] 1584; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1585; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4],xmm4[5],xmm2[6,7] 1586; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1587; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,0,2,1,4,5,6,7] 1588; AVX-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero 1589; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4],xmm9[5],xmm4[6,7] 1590; AVX-NEXT: 
vinsertf128 $1, %xmm2, %ymm4, %ymm2 1591; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[3],ymm2[4],ymm6[5],ymm2[6],ymm6[7] 1592; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,6,6,7] 1593; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1594; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,5,7,7] 1595; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 1596; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5,6],xmm4[7] 1597; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,4,6,5] 1598; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 1599; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,7] 1600; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 1601; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6],xmm6[7] 1602; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 1603; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] 1604; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] 1605; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] 1606; AVX-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero 1607; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4],xmm5[5],xmm6[6,7] 1608; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] 1609; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 1610; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1611; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1612; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4],xmm3[5],xmm1[6,7] 1613; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 1614; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] 1615; AVX-NEXT: vmovaps %ymm1, 96(%rax) 1616; AVX-NEXT: vmovaps %ymm2, 64(%rax) 1617; AVX-NEXT: vmovaps %ymm7, 32(%rax) 1618; AVX-NEXT: vmovaps %ymm0, (%rax) 1619; AVX-NEXT: vzeroupper 1620; AVX-NEXT: retq 1621; 1622; AVX2-LABEL: store_i8_stride8_vf16: 1623; AVX2: # %bb.0: 1624; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 1625; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 1626; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 1627; AVX2-NEXT: vmovdqa (%rdi), %xmm0 1628; AVX2-NEXT: vmovdqa (%rdx), %xmm1 1629; AVX2-NEXT: vmovdqa (%r8), %xmm2 1630; AVX2-NEXT: vmovdqa (%r11), %xmm3 1631; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1632; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1633; AVX2-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm4 1634; AVX2-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 1635; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2] 1636; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] 1637; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm2 1638; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] 1639; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,2048,0,2305,0,2562,0,2819] 1640; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm9 1641; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] 1642; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] 1643; AVX2-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 1644; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm11 1645; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] 1646; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm13 = [2048,2305,2562,2819] 1647; AVX2-NEXT: vpshufb %ymm13, %ymm12, %ymm14 1648; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] 1649; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] 1650; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] 1651; AVX2-NEXT: vpshufb %ymm11, %ymm5, %ymm5 1652; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,3076,0,3333,0,3590,0,3847] 1653; AVX2-NEXT: vpshufb %ymm14, %ymm7, %ymm7 1654; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] 1655; AVX2-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 1656; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm9 1657; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm15 = [3076,3333,3590,3847] 1658; AVX2-NEXT: vpshufb %ymm15, %ymm12, %ymm12 1659; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] 1660; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] 1661; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] 1662; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm6 1663; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3] 1664; AVX2-NEXT: vpshufb %ymm8, %ymm4, %ymm8 1665; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15] 1666; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 1667; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm8 1668; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 1669; AVX2-NEXT: vpshufb %ymm13, %ymm0, %ymm9 1670; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] 1671; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] 1672; AVX2-NEXT: vpshufb %ymm11, %ymm3, %ymm3 1673; AVX2-NEXT: vpshufb %ymm14, %ymm4, %ymm4 1674; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] 1675; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1 1676; AVX2-NEXT: vpshufb %ymm15, %ymm0, %ymm0 1677; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] 1678; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] 1679; AVX2-NEXT: vmovdqa %ymm0, 96(%rax) 1680; AVX2-NEXT: vmovdqa %ymm6, 64(%rax) 1681; AVX2-NEXT: vmovdqa %ymm5, 32(%rax) 1682; AVX2-NEXT: vmovdqa %ymm2, (%rax) 1683; AVX2-NEXT: vzeroupper 1684; AVX2-NEXT: retq 1685; 1686; AVX2-FP-LABEL: store_i8_stride8_vf16: 1687; AVX2-FP: # %bb.0: 1688; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1689; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1690; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 1691; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 1692; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 1693; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 1694; AVX2-FP-NEXT: vmovdqa (%r11), %xmm3 1695; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1696; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1697; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm4 1698; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 1699; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2] 1700; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] 1701; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 1702; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] 1703; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,2048,0,2305,0,2562,0,2819] 1704; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 1705; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] 1706; 
AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] 1707; AVX2-FP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 1708; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 1709; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] 1710; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [2048,2305,2562,2819] 1711; AVX2-FP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 1712; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] 1713; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] 1714; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] 1715; AVX2-FP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 1716; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,3076,0,3333,0,3590,0,3847] 1717; AVX2-FP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 1718; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] 1719; AVX2-FP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 1720; AVX2-FP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 1721; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [3076,3333,3590,3847] 1722; AVX2-FP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 1723; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] 1724; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] 1725; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] 1726; AVX2-FP-NEXT: vpshufb %ymm6, %ymm3, %ymm6 1727; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3] 1728; AVX2-FP-NEXT: vpshufb %ymm8, %ymm4, %ymm8 1729; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15] 1730; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 1731; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm8 1732; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 1733; AVX2-FP-NEXT: vpshufb %ymm13, %ymm0, %ymm9 1734; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] 1735; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] 1736; AVX2-FP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 1737; AVX2-FP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 1738; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] 1739; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 1740; AVX2-FP-NEXT: vpshufb %ymm15, %ymm0, %ymm0 1741; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] 1742; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] 1743; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) 1744; AVX2-FP-NEXT: vmovdqa %ymm6, 64(%rax) 1745; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rax) 1746; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax) 1747; AVX2-FP-NEXT: vzeroupper 1748; AVX2-FP-NEXT: retq 1749; 1750; AVX2-FCP-LABEL: store_i8_stride8_vf16: 1751; AVX2-FCP: # %bb.0: 1752; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1753; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1754; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 1755; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 1756; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 1757; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 1758; AVX2-FCP-NEXT: vmovdqa (%r11), %xmm3 1759; AVX2-FCP-NEXT: vinserti128 $1, 
(%rsi), %ymm0, %ymm0 1760; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1761; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm4 1762; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 1763; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2] 1764; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] 1765; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 1766; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] 1767; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,2048,0,2305,0,2562,0,2819] 1768; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 1769; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] 1770; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] 1771; AVX2-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 1772; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 1773; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] 1774; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [2048,2305,2562,2819] 1775; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 1776; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] 1777; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] 1778; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] 1779; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 1780; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,3076,0,3333,0,3590,0,3847] 1781; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 1782; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] 1783; AVX2-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 1784; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 1785; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [3076,3333,3590,3847] 1786; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 1787; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] 1788; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] 1789; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] 1790; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm6 1791; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3] 1792; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm8 1793; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15] 1794; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 1795; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm8 1796; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 1797; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm9 1798; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] 1799; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] 1800; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 1801; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 1802; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] 1803; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 1804; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0 1805; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] 
1806; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] 1807; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) 1808; AVX2-FCP-NEXT: vmovdqa %ymm6, 64(%rax) 1809; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%rax) 1810; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax) 1811; AVX2-FCP-NEXT: vzeroupper 1812; AVX2-FCP-NEXT: retq 1813; 1814; AVX512-LABEL: store_i8_stride8_vf16: 1815; AVX512: # %bb.0: 1816; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 1817; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 1818; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 1819; AVX512-NEXT: vmovdqa (%rdi), %xmm0 1820; AVX512-NEXT: vmovdqa (%rdx), %xmm1 1821; AVX512-NEXT: vmovdqa (%r8), %xmm2 1822; AVX512-NEXT: vmovdqa (%r11), %xmm3 1823; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1824; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1825; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 1826; AVX512-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 1827; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] 1828; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] 1829; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm6 1830; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] 1831; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] 1832; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm9 1833; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] 1834; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] 1835; AVX512-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 1836; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm11 1837; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] 1838; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] 1839; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm14 1840; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] 1841; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] 1842; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] 1843; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 1844; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] 1845; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7 1846; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] 1847; AVX512-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 1848; AVX512-NEXT: vpshufb %ymm7, %ymm9, %ymm9 1849; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] 1850; AVX512-NEXT: vpshufb %ymm15, %ymm12, %ymm12 1851; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] 1852; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] 1853; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 1854; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] 1855; AVX512-NEXT: vpshufb %ymm5, %ymm3, %ymm5 1856; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] 1857; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm6 1858; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] 1859; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 1860; AVX512-NEXT: vpshufb %ymm10, %ymm1, %ymm6 1861; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 1862; AVX512-NEXT: vpshufb 
%ymm13, %ymm0, %ymm8 1863; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] 1864; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] 1865; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 1866; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2 1867; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15] 1868; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1 1869; AVX512-NEXT: vpshufb %ymm15, %ymm0, %ymm0 1870; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] 1871; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] 1872; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 1873; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) 1874; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) 1875; AVX512-NEXT: vzeroupper 1876; AVX512-NEXT: retq 1877; 1878; AVX512-FCP-LABEL: store_i8_stride8_vf16: 1879; AVX512-FCP: # %bb.0: 1880; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1881; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1882; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 1883; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 1884; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 1885; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 1886; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3 1887; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1888; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1889; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 1890; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 1891; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] 1892; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] 1893; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 1894; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] 1895; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] 1896; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 1897; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] 1898; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] 1899; AVX512-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 1900; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 1901; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] 1902; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] 1903; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 1904; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] 1905; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] 1906; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] 1907; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 1908; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] 1909; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 1910; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] 1911; AVX512-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 1912; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 1913; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] 1914; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 
1915; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] 1916; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] 1917; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 1918; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] 1919; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5 1920; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] 1921; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm6 1922; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] 1923; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 1924; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm6 1925; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 1926; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm8 1927; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] 1928; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] 1929; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 1930; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 1931; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15] 1932; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 1933; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0 1934; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] 1935; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] 1936; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 1937; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) 1938; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 1939; AVX512-FCP-NEXT: vzeroupper 1940; AVX512-FCP-NEXT: retq 1941; 1942; AVX512DQ-LABEL: store_i8_stride8_vf16: 1943; AVX512DQ: # %bb.0: 1944; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 1945; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 1946; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 1947; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 1948; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 1949; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 1950; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3 1951; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1952; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1953; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 1954; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 1955; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] 1956; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] 1957; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm6 1958; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] 1959; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] 1960; AVX512DQ-NEXT: vpshufb %ymm8, %ymm7, %ymm9 1961; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] 1962; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] 1963; AVX512DQ-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 1964; AVX512DQ-NEXT: vpshufb %ymm10, %ymm9, %ymm11 1965; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] 1966; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] 1967; AVX512DQ-NEXT: vpshufb %ymm13, %ymm12, %ymm14 1968; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] 1969; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] 1970; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] 1971; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 1972; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] 1973; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm7 1974; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] 1975; AVX512DQ-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 1976; AVX512DQ-NEXT: vpshufb %ymm7, %ymm9, %ymm9 1977; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] 1978; AVX512DQ-NEXT: vpshufb %ymm15, %ymm12, %ymm12 1979; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] 1980; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] 1981; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 1982; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] 1983; AVX512DQ-NEXT: vpshufb %ymm5, %ymm3, %ymm5 1984; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] 1985; AVX512DQ-NEXT: vpshufb %ymm8, %ymm2, %ymm6 1986; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] 1987; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 1988; AVX512DQ-NEXT: vpshufb %ymm10, %ymm1, %ymm6 1989; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 1990; AVX512DQ-NEXT: vpshufb %ymm13, %ymm0, %ymm8 1991; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] 1992; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] 1993; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3 1994; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2 1995; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15] 1996; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm1 1997; AVX512DQ-NEXT: vpshufb %ymm15, %ymm0, %ymm0 1998; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] 1999; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] 2000; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 2001; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax) 2002; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) 2003; AVX512DQ-NEXT: vzeroupper 2004; AVX512DQ-NEXT: retq 2005; 2006; AVX512DQ-FCP-LABEL: store_i8_stride8_vf16: 2007; AVX512DQ-FCP: # %bb.0: 2008; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2009; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 2010; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 2011; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 2012; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 2013; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 2014; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3 2015; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 2016; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 2017; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 2018; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 2019; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] 2020; AVX512DQ-FCP-NEXT: 
vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] 2021; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 2022; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] 2023; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] 2024; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 2025; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] 2026; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] 2027; AVX512DQ-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 2028; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 2029; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] 2030; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] 2031; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 2032; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] 2033; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] 2034; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] 2035; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 2036; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] 2037; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 2038; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] 2039; AVX512DQ-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 2040; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 2041; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] 2042; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 2043; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] 2044; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] 2045; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 2046; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] 2047; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5 2048; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] 2049; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm6 2050; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] 2051; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] 2052; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm6 2053; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 2054; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm8 2055; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] 2056; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] 2057; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 2058; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 2059; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15] 2060; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 2061; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0 2062; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] 2063; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] 2064; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 2065; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) 2066; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 2067; AVX512DQ-FCP-NEXT: vzeroupper 2068; AVX512DQ-FCP-NEXT: retq 2069; 2070; AVX512BW-LABEL: store_i8_stride8_vf16: 2071; AVX512BW: # %bb.0: 2072; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 2073; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 2074; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 2075; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 2076; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 2077; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 2078; AVX512BW-NEXT: vmovdqa (%r11), %xmm3 2079; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 2080; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 2081; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 2082; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 2083; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 2084; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 2085; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 2086; AVX512BW-NEXT: vpermq {{.*#+}} zmm4 = zmm2[0,2,0,2,4,6,4,6] 2087; AVX512BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] 2088; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7] 2089; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm3[0,2,0,2,4,6,4,6] 2090; AVX512BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] 2091; AVX512BW-NEXT: movl $-2004318072, %ecx # imm = 0x88888888 2092; AVX512BW-NEXT: kmovd %ecx, %k1 2093; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm4 {%k1} 2094; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 2095; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm0[0,2,0,2,4,6,4,6] 2096; AVX512BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] 2097; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] 2098; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm1[0,2,0,2,4,6,4,6] 2099; AVX512BW-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] 2100; AVX512BW-NEXT: movl $572662306, %ecx # imm = 0x22222222 2101; AVX512BW-NEXT: kmovd %ecx, %k2 2102; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k2} 2103; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA 2104; AVX512BW-NEXT: kmovd %ecx, %k3 2105; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k3} 2106; AVX512BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] 2107; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] 2108; AVX512BW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[1,3,1,3,5,7,5,7] 2109; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] 2110; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1} 2111; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] 2112; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] 2113; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7] 2114; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] 2115; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} 2116; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} 2117; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) 2118; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) 2119; AVX512BW-NEXT: vzeroupper 2120; AVX512BW-NEXT: retq 2121; 2122; AVX512BW-FCP-LABEL: store_i8_stride8_vf16: 2123; AVX512BW-FCP: # %bb.0: 2124; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2125; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 2126; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 2127; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 2128; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 2129; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 2130; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3 2131; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 2132; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 2133; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 2134; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 2135; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 2136; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 2137; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14] 2138; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 2139; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] 2140; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 2141; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6] 2142; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] 2143; AVX512BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888 2144; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 2145; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1} 2146; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 2147; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] 2148; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 2149; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6] 2150; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] 2151; AVX512BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222 2152; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 2153; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2} 2154; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA 2155; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 2156; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3} 2157; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7] 2158; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2159; AVX512BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5 2160; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = 
zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] 2161; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] 2162; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] 2163; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1} 2164; AVX512BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1 2165; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] 2166; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] 2167; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] 2168; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} 2169; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} 2170; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) 2171; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) 2172; AVX512BW-FCP-NEXT: vzeroupper 2173; AVX512BW-FCP-NEXT: retq 2174; 2175; AVX512DQ-BW-LABEL: store_i8_stride8_vf16: 2176; AVX512DQ-BW: # %bb.0: 2177; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 2178; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 2179; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 2180; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 2181; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 2182; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 2183; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3 2184; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 2185; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 2186; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 2187; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 2188; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 2189; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 2190; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 2191; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm4 = zmm2[0,2,0,2,4,6,4,6] 2192; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] 2193; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7] 2194; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm3[0,2,0,2,4,6,4,6] 2195; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] 2196; AVX512DQ-BW-NEXT: movl $-2004318072, %ecx # imm = 0x88888888 2197; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 2198; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm4 {%k1} 2199; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 2200; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm0[0,2,0,2,4,6,4,6] 2201; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] 2202; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] 2203; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm1[0,2,0,2,4,6,4,6] 2204; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] 2205; 
AVX512DQ-BW-NEXT: movl $572662306, %ecx # imm = 0x22222222 2206; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 2207; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k2} 2208; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA 2209; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 2210; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k3} 2211; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] 2212; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] 2213; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[1,3,1,3,5,7,5,7] 2214; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] 2215; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1} 2216; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] 2217; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] 2218; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7] 2219; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] 2220; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} 2221; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} 2222; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) 2223; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) 2224; AVX512DQ-BW-NEXT: vzeroupper 2225; AVX512DQ-BW-NEXT: retq 2226; 2227; AVX512DQ-BW-FCP-LABEL: store_i8_stride8_vf16: 2228; AVX512DQ-BW-FCP: # %bb.0: 2229; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2230; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 2231; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 2232; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 2233; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 2234; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 2235; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3 2236; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 2237; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 2238; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 2239; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 2240; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 2241; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 2242; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14] 2243; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 2244; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] 2245; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 2246; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6] 2247; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] 2248; AVX512DQ-BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888 2249; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 2250; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1} 2251; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 2252; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = 
zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] 2253; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 2254; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6] 2255; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] 2256; AVX512DQ-BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222 2257; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 2258; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2} 2259; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA 2260; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 2261; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3} 2262; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7] 2263; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2264; AVX512DQ-BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5 2265; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] 2266; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] 2267; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] 2268; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1} 2269; AVX512DQ-BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1 2270; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] 2271; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] 2272; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] 2273; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} 2274; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} 2275; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) 2276; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) 2277; AVX512DQ-BW-FCP-NEXT: vzeroupper 2278; AVX512DQ-BW-FCP-NEXT: retq 2279 %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 2280 %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64 2281 %in.vec2 = load <16 x i8>, ptr %in.vecptr2, align 64 2282 %in.vec3 = load <16 x i8>, ptr %in.vecptr3, align 64 2283 %in.vec4 = load <16 x i8>, ptr %in.vecptr4, align 64 2284 %in.vec5 = load <16 x i8>, ptr %in.vecptr5, align 64 2285 %in.vec6 = load <16 x i8>, ptr %in.vecptr6, align 64 2286 %in.vec7 = load <16 x i8>, ptr %in.vecptr7, align 64 2287 %1 = shufflevector <16 x i8> %in.vec0, <16 x i8> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2288 %2 = shufflevector <16 x i8> %in.vec2, <16 x i8> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2289 %3 = shufflevector <16 x i8> %in.vec4, <16 x i8> %in.vec5, <32 
x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2290 %4 = shufflevector <16 x i8> %in.vec6, <16 x i8> %in.vec7, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2291 %5 = shufflevector <32 x i8> %1, <32 x i8> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 2292 %6 = shufflevector <32 x i8> %3, <32 x i8> %4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 2293 %7 = shufflevector <64 x i8> %5, <64 x i8> %6, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 2294 %interleaved.vec = shufflevector <128 x i8> %7, <128 x i8> poison, <128 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 80, i32 96, i32 112, i32 1, i32 17, i32 33, i32 49, i32 65, i32 81, i32 97, i32 113, i32 2, i32 18, i32 34, i32 50, i32 66, i32 82, i32 98, i32 114, i32 3, i32 19, i32 35, i32 51, i32 67, i32 83, i32 99, i32 115, i32 4, i32 20, i32 36, i32 52, i32 68, i32 84, i32 100, i32 116, i32 5, i32 21, i32 37, i32 53, i32 69, i32 85, i32 101, i32 117, i32 6, i32 22, i32 38, i32 54, i32 70, i32 86, i32 102, i32 118, i32 7, i32 23, i32 39, i32 55, i32 71, i32 87, i32 103, i32 119, i32 8, i32 24, i32 40, i32 56, i32 72, i32 88, i32 104, i32 120, i32 9, i32 25, i32 41, i32 57, i32 73, i32 89, i32 105, i32 121, i32 10, 
i32 26, i32 42, i32 58, i32 74, i32 90, i32 106, i32 122, i32 11, i32 27, i32 43, i32 59, i32 75, i32 91, i32 107, i32 123, i32 12, i32 28, i32 44, i32 60, i32 76, i32 92, i32 108, i32 124, i32 13, i32 29, i32 45, i32 61, i32 77, i32 93, i32 109, i32 125, i32 14, i32 30, i32 46, i32 62, i32 78, i32 94, i32 110, i32 126, i32 15, i32 31, i32 47, i32 63, i32 79, i32 95, i32 111, i32 127> 2295 store <128 x i8> %interleaved.vec, ptr %out.vec, align 64 2296 ret void 2297} 2298 2299define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { 2300; SSE-LABEL: store_i8_stride8_vf32: 2301; SSE: # %bb.0: 2302; SSE-NEXT: subq $232, %rsp 2303; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 2304; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 2305; SSE-NEXT: movdqa (%rdi), %xmm5 2306; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2307; SSE-NEXT: movdqa (%rsi), %xmm4 2308; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2309; SSE-NEXT: movdqa (%rdx), %xmm1 2310; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2311; SSE-NEXT: movdqa (%rcx), %xmm8 2312; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2313; SSE-NEXT: movdqa (%r8), %xmm13 2314; SSE-NEXT: movdqa (%r9), %xmm12 2315; SSE-NEXT: movdqa (%r10), %xmm14 2316; SSE-NEXT: movdqa (%rax), %xmm11 2317; SSE-NEXT: movdqa %xmm14, %xmm2 2318; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] 2319; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] 2320; SSE-NEXT: movdqa %xmm2, %xmm15 2321; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2322; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,1] 2323; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,0] 2324; SSE-NEXT: movdqa %xmm9, %xmm6 2325; SSE-NEXT: pandn %xmm2, %xmm6 2326; SSE-NEXT: movdqa %xmm13, %xmm3 2327; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] 2328; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] 2329; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2330; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,1] 2331; SSE-NEXT: pand %xmm9, %xmm7 2332; SSE-NEXT: por %xmm6, %xmm7 2333; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 2334; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] 2335; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2336; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,1,3] 2337; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] 2338; SSE-NEXT: movdqa %xmm8, %xmm10 2339; SSE-NEXT: pandn %xmm6, %xmm10 2340; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 2341; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] 2342; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2343; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] 2344; SSE-NEXT: pand %xmm8, %xmm6 2345; SSE-NEXT: por %xmm10, %xmm6 2346; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] 2347; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] 2348; SSE-NEXT: 
punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] 2349; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2350; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[0,0,2,1,4,5,6,7] 2351; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 2352; SSE-NEXT: movdqa %xmm9, %xmm7 2353; SSE-NEXT: pandn %xmm6, %xmm7 2354; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] 2355; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 2356; SSE-NEXT: pand %xmm9, %xmm6 2357; SSE-NEXT: por %xmm7, %xmm6 2358; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7] 2359; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] 2360; SSE-NEXT: movdqa %xmm8, %xmm10 2361; SSE-NEXT: pandn %xmm7, %xmm10 2362; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,0,0] 2363; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 2364; SSE-NEXT: pand %xmm8, %xmm7 2365; SSE-NEXT: por %xmm10, %xmm7 2366; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] 2367; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] 2368; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 2369; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2370; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] 2371; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[0,2,2,3,4,5,6,7] 2372; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2373; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] 2374; SSE-NEXT: movdqa %xmm9, %xmm6 2375; SSE-NEXT: pandn %xmm5, %xmm6 2376; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] 2377; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[2,1,3,3,4,5,6,7] 2378; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2379; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] 2380; SSE-NEXT: pand %xmm9, %xmm4 2381; SSE-NEXT: por %xmm6, %xmm4 2382; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] 2383; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 2384; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 2385; SSE-NEXT: # xmm12 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] 2386; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,2,2,3,4,5,6,7] 2387; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2388; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] 2389; SSE-NEXT: movdqa %xmm8, %xmm5 2390; SSE-NEXT: pandn %xmm3, %xmm5 2391; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 2392; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 2393; SSE-NEXT: # xmm11 = xmm11[8],mem[8],xmm11[9],mem[9],xmm11[10],mem[10],xmm11[11],mem[11],xmm11[12],mem[12],xmm11[13],mem[13],xmm11[14],mem[14],xmm11[15],mem[15] 2394; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] 2395; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2396; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 2397; SSE-NEXT: pand %xmm8, %xmm1 2398; SSE-NEXT: por %xmm5, %xmm1 2399; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 2400; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2401; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2402; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,0,2,1,4,5,6,7] 2403; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[0,0,2,1] 2404; SSE-NEXT: movdqa %xmm9, %xmm3 2405; SSE-NEXT: pandn %xmm1, %xmm3 2406; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,1,1,3,4,5,6,7] 2407; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 2408; SSE-NEXT: pand %xmm9, %xmm1 2409; SSE-NEXT: por %xmm3, %xmm1 2410; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,0,2,1,4,5,6,7] 2411; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] 2412; SSE-NEXT: movdqa %xmm8, %xmm4 2413; SSE-NEXT: pandn %xmm3, %xmm4 2414; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,0,0] 2415; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] 2416; SSE-NEXT: pand %xmm8, %xmm3 2417; SSE-NEXT: por %xmm4, %xmm3 2418; SSE-NEXT: movdqa 16(%r10), %xmm10 2419; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 2420; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 2421; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2422; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2423; SSE-NEXT: movdqa 16(%rax), %xmm1 2424; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2425; SSE-NEXT: movdqa %xmm10, %xmm5 2426; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 2427; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] 2428; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2429; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 2430; SSE-NEXT: movdqa %xmm9, %xmm3 2431; SSE-NEXT: pandn %xmm1, %xmm3 2432; SSE-NEXT: movdqa 16(%r8), %xmm12 2433; SSE-NEXT: movdqa 16(%r9), %xmm11 2434; SSE-NEXT: movdqa %xmm12, %xmm4 2435; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] 2436; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] 2437; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill 2438; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,1] 2439; SSE-NEXT: pand %xmm9, %xmm0 2440; SSE-NEXT: por %xmm3, %xmm0 2441; SSE-NEXT: movdqa 16(%rdx), %xmm13 2442; SSE-NEXT: movdqa 16(%rcx), %xmm7 2443; SSE-NEXT: movdqa %xmm13, %xmm3 2444; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] 2445; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] 2446; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2447; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] 2448; SSE-NEXT: movdqa %xmm8, %xmm2 2449; SSE-NEXT: pandn %xmm1, %xmm2 2450; SSE-NEXT: movdqa 16(%rdi), %xmm14 2451; SSE-NEXT: movdqa 16(%rsi), %xmm6 2452; SSE-NEXT: movdqa %xmm14, %xmm1 2453; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 2454; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,1,1,1] 2455; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2456; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] 2457; SSE-NEXT: pand %xmm8, %xmm15 2458; SSE-NEXT: por %xmm2, %xmm15 2459; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] 2460; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] 2461; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 2462; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2463; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] 2464; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 2465; SSE-NEXT: movdqa %xmm9, %xmm2 2466; SSE-NEXT: pandn %xmm0, 
%xmm2 2467; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] 2468; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 2469; SSE-NEXT: pand %xmm9, %xmm0 2470; SSE-NEXT: por %xmm2, %xmm0 2471; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] 2472; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] 2473; SSE-NEXT: movdqa %xmm8, %xmm15 2474; SSE-NEXT: pandn %xmm2, %xmm15 2475; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] 2476; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 2477; SSE-NEXT: pand %xmm8, %xmm2 2478; SSE-NEXT: por %xmm15, %xmm2 2479; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] 2480; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2481; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 2482; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2483; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 2484; SSE-NEXT: # xmm10 = xmm10[8],mem[8],xmm10[9],mem[9],xmm10[10],mem[10],xmm10[11],mem[11],xmm10[12],mem[12],xmm10[13],mem[13],xmm10[14],mem[14],xmm10[15],mem[15] 2485; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] 2486; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2487; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 2488; SSE-NEXT: movdqa %xmm9, %xmm2 2489; SSE-NEXT: pandn %xmm0, %xmm2 2490; SSE-NEXT: movdqa %xmm12, %xmm15 2491; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm11[8],xmm15[9],xmm11[9],xmm15[10],xmm11[10],xmm15[11],xmm11[11],xmm15[12],xmm11[12],xmm15[13],xmm11[13],xmm15[14],xmm11[14],xmm15[15],xmm11[15] 2492; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,1,3,3,4,5,6,7] 2493; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 2494; SSE-NEXT: pand %xmm9, %xmm0 2495; SSE-NEXT: por %xmm2, %xmm0 2496; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15] 2497; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,2,2,3,4,5,6,7] 2498; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] 2499; SSE-NEXT: movdqa %xmm8, %xmm3 2500; SSE-NEXT: pandn %xmm2, %xmm3 2501; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] 2502; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] 2503; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 2504; SSE-NEXT: pand %xmm8, %xmm1 2505; SSE-NEXT: por %xmm3, %xmm1 2506; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] 2507; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2508; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2509; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2510; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,0,2,1,4,5,6,7] 2511; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 2512; SSE-NEXT: movdqa %xmm9, %xmm1 2513; SSE-NEXT: pandn %xmm0, %xmm1 2514; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,1,3,4,5,6,7] 2515; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 2516; SSE-NEXT: pand %xmm9, %xmm0 2517; SSE-NEXT: por %xmm1, %xmm0 2518; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,0,2,1,4,5,6,7] 2519; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] 2520; SSE-NEXT: movdqa %xmm8, %xmm2 2521; SSE-NEXT: pandn %xmm1, %xmm2 2522; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,0,0] 2523; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 2524; SSE-NEXT: pand %xmm8, %xmm1 2525; SSE-NEXT: por %xmm2, %xmm1 2526; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] 2527; SSE-NEXT: 
pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2528; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2529; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2530; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2531; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] 2532; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2533; SSE-NEXT: movdqa %xmm9, %xmm1 2534; SSE-NEXT: pandn %xmm0, %xmm1 2535; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2536; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,7,7] 2537; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2538; SSE-NEXT: pand %xmm9, %xmm0 2539; SSE-NEXT: por %xmm1, %xmm0 2540; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2541; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7] 2542; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] 2543; SSE-NEXT: movdqa %xmm8, %xmm2 2544; SSE-NEXT: pandn %xmm1, %xmm2 2545; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 2546; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] 2547; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 2548; SSE-NEXT: pand %xmm8, %xmm1 2549; SSE-NEXT: por %xmm2, %xmm1 2550; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] 2551; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2552; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2553; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2554; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] 2555; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2556; SSE-NEXT: movdqa %xmm9, %xmm1 2557; SSE-NEXT: pandn %xmm0, %xmm1 2558; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,7] 2559; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2560; SSE-NEXT: pand %xmm9, %xmm0 2561; SSE-NEXT: por %xmm1, %xmm0 2562; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] 2563; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] 2564; SSE-NEXT: movdqa %xmm8, %xmm2 2565; SSE-NEXT: pandn %xmm1, %xmm2 2566; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] 2567; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 2568; SSE-NEXT: pand %xmm8, %xmm1 2569; SSE-NEXT: por %xmm2, %xmm1 2570; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] 2571; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2572; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2573; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2574; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] 2575; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2576; SSE-NEXT: movdqa %xmm9, %xmm2 2577; SSE-NEXT: pandn %xmm0, %xmm2 2578; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2579; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,7,7] 2580; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2581; SSE-NEXT: pand %xmm9, %xmm0 2582; SSE-NEXT: por %xmm2, %xmm0 2583; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 2584; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,6,6,7] 2585; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] 2586; SSE-NEXT: movdqa %xmm8, %xmm3 2587; SSE-NEXT: pandn %xmm2, %xmm3 2588; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 2589; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[3,3,3,3] 2590; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 2591; SSE-NEXT: pand %xmm8, %xmm2 2592; SSE-NEXT: por %xmm3, %xmm2 2593; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] 2594; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] 2595; SSE-NEXT: punpckldq {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1] 2596; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,4,6,5] 2597; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2598; SSE-NEXT: movdqa %xmm9, %xmm2 2599; SSE-NEXT: pandn %xmm0, %xmm2 2600; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,5,7] 2601; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2602; SSE-NEXT: pand %xmm9, %xmm0 2603; SSE-NEXT: por %xmm2, %xmm0 2604; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,4,6,5] 2605; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] 2606; SSE-NEXT: movdqa %xmm8, %xmm4 2607; SSE-NEXT: pandn %xmm2, %xmm4 2608; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,2,2] 2609; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 2610; SSE-NEXT: pand %xmm8, %xmm2 2611; SSE-NEXT: por %xmm4, %xmm2 2612; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] 2613; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2614; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2615; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 2616; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,6,6,7] 2617; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2618; SSE-NEXT: movdqa %xmm9, %xmm4 2619; SSE-NEXT: pandn %xmm2, %xmm4 2620; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload 2621; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,7,7] 2622; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2623; SSE-NEXT: pand %xmm9, %xmm2 2624; SSE-NEXT: por %xmm4, %xmm2 2625; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 2626; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] 2627; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] 2628; SSE-NEXT: movdqa %xmm8, %xmm5 2629; SSE-NEXT: pandn %xmm4, %xmm5 2630; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 2631; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] 2632; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 2633; SSE-NEXT: pand %xmm8, %xmm4 2634; SSE-NEXT: por %xmm5, %xmm4 2635; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] 2636; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] 2637; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 2638; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,4,6,5] 2639; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2640; SSE-NEXT: movdqa %xmm9, %xmm5 2641; SSE-NEXT: pandn %xmm4, %xmm5 2642; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,5,7] 2643; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2644; SSE-NEXT: pand %xmm9, %xmm4 2645; SSE-NEXT: por %xmm5, %xmm4 2646; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,4,6,5] 2647; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] 2648; SSE-NEXT: movdqa %xmm8, %xmm10 2649; SSE-NEXT: pandn %xmm5, %xmm10 2650; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,2,2,2] 2651; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] 2652; SSE-NEXT: pand %xmm8, %xmm5 2653; SSE-NEXT: por %xmm10, %xmm5 2654; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] 2655; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 2656; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] 2657; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 2658; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,6,6,7] 2659; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 2660; SSE-NEXT: movdqa %xmm9, %xmm10 2661; SSE-NEXT: pandn %xmm5, %xmm10 2662; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,6,5,7,7] 2663; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 2664; SSE-NEXT: pand %xmm9, %xmm5 2665; SSE-NEXT: por %xmm10, %xmm5 2666; SSE-NEXT: pshufhw {{.*#+}} xmm10 = 
xmm13[0,1,2,3,4,6,6,7] 2667; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] 2668; SSE-NEXT: movdqa %xmm8, %xmm12 2669; SSE-NEXT: pandn %xmm10, %xmm12 2670; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[3,3,3,3] 2671; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] 2672; SSE-NEXT: pand %xmm8, %xmm10 2673; SSE-NEXT: por %xmm12, %xmm10 2674; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] 2675; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] 2676; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] 2677; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,4,6,5] 2678; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 2679; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,5,5,7] 2680; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] 2681; SSE-NEXT: pand %xmm9, %xmm11 2682; SSE-NEXT: pandn %xmm5, %xmm9 2683; SSE-NEXT: por %xmm11, %xmm9 2684; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,4,6,5] 2685; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] 2686; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,2,2] 2687; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] 2688; SSE-NEXT: pand %xmm8, %xmm6 2689; SSE-NEXT: pandn %xmm5, %xmm8 2690; SSE-NEXT: por %xmm6, %xmm8 2691; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,3,2,3] 2692; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,2,3] 2693; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 2694; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 2695; SSE-NEXT: movdqa %xmm6, 224(%rax) 2696; SSE-NEXT: movdqa %xmm10, 240(%rax) 2697; SSE-NEXT: movdqa %xmm4, 160(%rax) 2698; SSE-NEXT: movdqa %xmm2, 176(%rax) 2699; SSE-NEXT: movdqa %xmm0, 96(%rax) 2700; SSE-NEXT: movdqa %xmm3, 112(%rax) 2701; SSE-NEXT: movdqa %xmm1, 32(%rax) 2702; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2703; SSE-NEXT: movaps %xmm0, 48(%rax) 2704; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2705; SSE-NEXT: movaps %xmm0, 192(%rax) 2706; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2707; SSE-NEXT: movaps %xmm0, 208(%rax) 2708; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2709; SSE-NEXT: movaps %xmm0, 128(%rax) 2710; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2711; SSE-NEXT: movaps %xmm0, 144(%rax) 2712; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2713; SSE-NEXT: movaps %xmm0, 64(%rax) 2714; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2715; SSE-NEXT: movaps %xmm0, 80(%rax) 2716; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2717; SSE-NEXT: movaps %xmm0, (%rax) 2718; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2719; SSE-NEXT: movaps %xmm0, 16(%rax) 2720; SSE-NEXT: addq $232, %rsp 2721; SSE-NEXT: retq 2722; 2723; AVX-LABEL: store_i8_stride8_vf32: 2724; AVX: # %bb.0: 2725; AVX-NEXT: subq $56, %rsp 2726; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 2727; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 2728; AVX-NEXT: vmovdqa (%r10), %xmm0 2729; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2730; AVX-NEXT: vmovdqa (%rax), %xmm2 2731; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 2732; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 2733; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] 2734; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 2735; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] 2736; AVX-NEXT: vbroadcastsd {{.*#+}} ymm9 = 
[65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] 2737; AVX-NEXT: vandnps %ymm1, %ymm9, %ymm1 2738; AVX-NEXT: vmovdqa (%r9), %xmm5 2739; AVX-NEXT: vmovdqa (%r8), %xmm7 2740; AVX-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] 2741; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] 2742; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,1,3,3,4,5,6,7] 2743; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 2744; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] 2745; AVX-NEXT: vandps %ymm3, %ymm9, %ymm3 2746; AVX-NEXT: vorps %ymm1, %ymm3, %ymm8 2747; AVX-NEXT: vmovdqa (%rsi), %xmm3 2748; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2749; AVX-NEXT: vmovdqa (%rdi), %xmm1 2750; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 2751; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 2752; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] 2753; AVX-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 2754; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm13 2755; AVX-NEXT: vmovdqa (%rcx), %xmm10 2756; AVX-NEXT: vmovdqa (%rdx), %xmm11 2757; AVX-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] 2758; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,0,2,1,4,5,6,7] 2759; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 2760; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[0,2,2,3,4,5,6,7] 2761; AVX-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero 2762; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm4, %ymm14 2763; AVX-NEXT: vbroadcastsd {{.*#+}} ymm6 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] 2764; AVX-NEXT: vandps %ymm6, %ymm13, %ymm13 2765; AVX-NEXT: vandnps %ymm14, %ymm6, %ymm14 2766; AVX-NEXT: vorps %ymm14, %ymm13, %ymm13 2767; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4],ymm8[5],ymm13[6],ymm8[7] 2768; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2769; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,5] 2770; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 2771; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 2772; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,5,5,7] 2773; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] 2774; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 2775; AVX-NEXT: vmovdqa 16(%r10), %xmm4 2776; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 2777; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 2778; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] 2779; AVX-NEXT: vandps %ymm9, %ymm8, %ymm8 2780; AVX-NEXT: vorps %ymm0, %ymm8, %ymm0 2781; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,4,6,5] 2782; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,6,6,7] 2783; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm12 2784; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] 2785; AVX-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero 2786; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] 2787; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 2788; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3 2789; AVX-NEXT: vmovdqa 16(%rax), %xmm8 
2790; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2791; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] 2792; AVX-NEXT: vandnps %ymm12, %ymm6, %ymm12 2793; AVX-NEXT: vandps %ymm6, %ymm3, %ymm3 2794; AVX-NEXT: vorps %ymm3, %ymm12, %ymm3 2795; AVX-NEXT: vmovdqa 16(%r9), %xmm13 2796; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] 2797; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2798; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 2799; AVX-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] 2800; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2801; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] 2802; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 2803; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2804; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] 2805; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2806; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,5,7] 2807; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] 2808; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 2809; AVX-NEXT: vmovdqa 16(%r8), %xmm7 2810; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 2811; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 2812; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 2813; AVX-NEXT: vandps %ymm2, %ymm9, %ymm2 2814; AVX-NEXT: vmovaps %ymm9, %ymm5 2815; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 2816; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] 2817; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2818; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] 2819; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 2820; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 2821; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] 2822; AVX-NEXT: vandnps %ymm2, %ymm6, %ymm2 2823; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload 2824; AVX-NEXT: # xmm3 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] 2825; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2826; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] 2827; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 2828; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] 2829; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 2830; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 2831; AVX-NEXT: vandps %ymm6, %ymm1, %ymm1 2832; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1 2833; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 2834; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2835; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] 2836; AVX-NEXT: vmovdqa %xmm4, %xmm8 2837; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,4,6,5] 2838; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,6,6,7] 
2839; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2840; AVX-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] 2841; AVX-NEXT: vmovdqa %xmm13, %xmm9 2842; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,5,5,7] 2843; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,7,7] 2844; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2845; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 2846; AVX-NEXT: vandnps %ymm0, %ymm5, %ymm0 2847; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 2848; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1 2849; AVX-NEXT: vorps %ymm0, %ymm1, %ymm5 2850; AVX-NEXT: vmovdqa 16(%rcx), %xmm4 2851; AVX-NEXT: vmovdqa 16(%rdx), %xmm3 2852; AVX-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 2853; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,4,6,5] 2854; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,6,6,7] 2855; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm0 2856; AVX-NEXT: vmovdqa 16(%rsi), %xmm2 2857; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 2858; AVX-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 2859; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[2,3,2,3] 2860; AVX-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero 2861; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] 2862; AVX-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero 2863; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 2864; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] 2865; AVX-NEXT: vandnps %ymm0, %ymm6, %ymm0 2866; AVX-NEXT: vandps %ymm6, %ymm13, %ymm13 2867; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0 2868; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7] 2869; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2870; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 2871; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,4,6,5] 2872; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,6,6,7] 2873; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 2874; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] 2875; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7] 2876; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,6,5,7,7] 2877; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 2878; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 2879; AVX-NEXT: vbroadcastsd {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] 2880; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 2881; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] 2882; AVX-NEXT: vandps %ymm7, %ymm9, %ymm7 2883; AVX-NEXT: vorps %ymm0, %ymm7, %ymm7 2884; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] 2885; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] 2886; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] 2887; AVX-NEXT: vinsertf128 $1, 
%xmm4, %ymm3, %ymm3 2888; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 2889; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] 2890; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 2891; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] 2892; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 2893; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 2894; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] 2895; AVX-NEXT: vandnps %ymm3, %ymm6, %ymm3 2896; AVX-NEXT: vandps %ymm6, %ymm1, %ymm1 2897; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1 2898; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4],ymm7[5],ymm1[6],ymm7[7] 2899; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,0,2,1,4,5,6,7] 2900; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,2,2,3,4,5,6,7] 2901; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 2902; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7] 2903; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] 2904; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 2905; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] 2906; AVX-NEXT: vandnps %ymm3, %ymm9, %ymm3 2907; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] 2908; AVX-NEXT: vandps %ymm4, %ymm9, %ymm4 2909; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3 2910; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 2911; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 2912; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 2913; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 2914; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7] 2915; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 2916; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2917; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 2918; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 2919; AVX-NEXT: vandps %ymm6, %ymm2, %ymm2 2920; AVX-NEXT: vandnps %ymm0, %ymm6, %ymm0 2921; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 2922; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] 2923; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,0,2,1,4,5,6,7] 2924; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] 2925; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 2926; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] 2927; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,1,3,3,4,5,6,7] 2928; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 2929; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] 2930; AVX-NEXT: vandnps %ymm2, %ymm9, %ymm2 2931; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] 2932; AVX-NEXT: vandps %ymm3, %ymm9, %ymm3 2933; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 2934; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero 2935; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] 2936; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 2937; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 2938; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,0,2,1,4,5,6,7] 2939; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 2940; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,2,2,3,4,5,6,7] 2941; AVX-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero 2942; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 2943; AVX-NEXT: vandps %ymm6, %ymm3, %ymm3 2944; AVX-NEXT: vandnps 
%ymm4, %ymm6, %ymm4 2945; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 2946; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] 2947; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2948; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7] 2949; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 2950; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 2951; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2952; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7] 2953; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] 2954; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 2955; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] 2956; AVX-NEXT: vandnps %ymm3, %ymm9, %ymm3 2957; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] 2958; AVX-NEXT: vandps %ymm4, %ymm9, %ymm4 2959; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3 2960; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2961; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero 2962; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] 2963; AVX-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero 2964; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 2965; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 2966; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,0,2,1,4,5,6,7] 2967; AVX-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero 2968; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] 2969; AVX-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero 2970; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 2971; AVX-NEXT: vandps %ymm6, %ymm4, %ymm4 2972; AVX-NEXT: vandnps %ymm5, %ymm6, %ymm5 2973; AVX-NEXT: vorps %ymm5, %ymm4, %ymm4 2974; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] 2975; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 2976; AVX-NEXT: vmovaps %ymm3, 64(%rax) 2977; AVX-NEXT: vmovaps %ymm2, 128(%rax) 2978; AVX-NEXT: vmovaps %ymm0, 192(%rax) 2979; AVX-NEXT: vmovaps %ymm1, 224(%rax) 2980; AVX-NEXT: vmovaps %ymm15, 160(%rax) 2981; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2982; AVX-NEXT: vmovaps %ymm0, 96(%rax) 2983; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2984; AVX-NEXT: vmovaps %ymm0, 32(%rax) 2985; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2986; AVX-NEXT: vmovaps %ymm0, (%rax) 2987; AVX-NEXT: addq $56, %rsp 2988; AVX-NEXT: vzeroupper 2989; AVX-NEXT: retq 2990; 2991; AVX2-LABEL: store_i8_stride8_vf32: 2992; AVX2: # %bb.0: 2993; AVX2-NEXT: subq $88, %rsp 2994; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 2995; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 2996; AVX2-NEXT: vmovdqa (%rsi), %xmm2 2997; AVX2-NEXT: vmovdqa (%rdi), %xmm3 2998; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 2999; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 3000; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] 3001; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 3002; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 3003; AVX2-NEXT: vmovdqa (%rcx), %xmm4 3004; AVX2-NEXT: vmovdqa (%rdx), %xmm5 3005; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 3006; 
AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,0,2,1,4,5,6,7] 3007; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero 3008; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7] 3009; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero 3010; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 3011; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] 3012; AVX2-NEXT: vmovdqa (%r10), %xmm6 3013; AVX2-NEXT: vmovdqa (%rax), %xmm7 3014; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 3015; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm13[0,0,2,1,4,5,6,7] 3016; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,2,2,3,4,5,6,7] 3017; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm9 3018; AVX2-NEXT: vmovdqa (%r9), %xmm10 3019; AVX2-NEXT: vmovdqa (%r8), %xmm11 3020; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] 3021; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[0,1,1,3,4,5,6,7] 3022; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,1,3,3,4,5,6,7] 3023; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm8, %ymm15 3024; AVX2-NEXT: vmovaps 16(%rsi), %xmm8 3025; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3026; AVX2-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5,6,7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13,14,15] 3027; AVX2-NEXT: vmovdqa 16(%rdi), %xmm9 3028; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,2,1,4,4,6,5] 3029; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4],ymm15[5],ymm0[6],ymm15[7] 3030; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3031; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 3032; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3033; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 3034; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 3035; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3036; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,4,6,5] 3037; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] 3038; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm1 3039; AVX2-NEXT: vmovdqa 16(%rcx), %xmm8 3040; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] 3041; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] 3042; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,4,6,5] 3043; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7] 3044; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 3045; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5,5,7] 3046; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,5,7,7] 3047; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 3048; AVX2-NEXT: vmovdqa 16(%rdx), %xmm15 3049; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5],ymm13[6],ymm1[7],ymm13[8,9,10,11,12],ymm1[13],ymm13[14],ymm1[15] 3050; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 3051; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 3052; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3053; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 3054; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3055; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 3056; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3057; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] 3058; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 3059; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 3060; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 3061; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3062; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] 3063; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,6,6,7] 3064; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 3065; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] 3066; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] 3067; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] 3068; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3069; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] 3070; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,6,6,7] 3071; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 3072; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] 3073; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3074; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7] 3075; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,6,5,7,7] 3076; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 3077; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5],ymm7[6],ymm2[7],ymm7[8,9,10,11,12],ymm2[13],ymm7[14],ymm2[15] 3078; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 3079; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] 3080; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3081; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3082; AVX2-NEXT: vmovdqa %xmm9, %xmm5 3083; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] 3084; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] 3085; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3086; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] 3087; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero 3088; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 3089; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] 3090; AVX2-NEXT: vmovdqa %xmm8, %xmm9 3091; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,4,6,5] 3092; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,6,6,7] 3093; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 3094; AVX2-NEXT: vmovdqa 16(%r10), %xmm8 3095; AVX2-NEXT: vpshufd 
{{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] 3096; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7,8],ymm13[9],ymm7[10,11,12],ymm13[13],ymm7[14,15] 3097; AVX2-NEXT: vmovdqa 16(%rax), %xmm4 3098; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] 3099; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,4,6,5] 3100; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7] 3101; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm14, %ymm2 3102; AVX2-NEXT: vmovdqa 16(%r9), %xmm3 3103; AVX2-NEXT: vmovdqa 16(%r8), %xmm1 3104; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 3105; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7] 3106; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm14[0,1,2,3,6,5,7,7] 3107; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 3108; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7],ymm0[8,9,10,11,12],ymm2[13],ymm0[14],ymm2[15] 3109; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 3110; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7] 3111; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3112; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] 3113; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] 3114; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3115; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[3,3,3,3] 3116; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 3117; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 3118; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] 3119; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,4,6,5] 3120; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,6,6,7] 3121; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm15, %ymm7 3122; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] 3123; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7,8],ymm7[9],ymm0[10,11,12],ymm7[13],ymm0[14,15] 3124; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 3125; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,4,6,5] 3126; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,6,7] 3127; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 3128; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 3129; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] 3130; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,6,5,7,7] 3131; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 3132; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6],ymm5[7],ymm3[8,9,10,11,12],ymm5[13],ymm3[14],ymm5[15] 3133; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 3134; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] 3135; AVX2-NEXT: vpmovzxwq {{.*#+}} 
xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 3136; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 3137; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 3138; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 3139; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,0,2,1,4,5,6,7] 3140; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 3141; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,2,2,3,4,5,6,7] 3142; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero 3143; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 3144; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] 3145; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7] 3146; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 3147; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 3148; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,1,3,4,5,6,7] 3149; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] 3150; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 3151; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5,6,7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13,14,15] 3152; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] 3153; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] 3154; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 3155; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] 3156; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 3157; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3158; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7] 3159; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 3160; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7] 3161; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 3162; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 3163; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] 3164; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,0,2,1,4,5,6,7] 3165; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,2,2,3,4,5,6,7] 3166; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 3167; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,1,3,4,5,6,7] 3168; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[2,1,3,3,4,5,6,7] 3169; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 3170; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5,6,7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13,14,15] 3171; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] 3172; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] 3173; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3174; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 3175; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] 3176; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 3177; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 3178; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 3179; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] 3180; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 3181; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] 3182; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero 3183; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 3184; AVX2-NEXT: 
vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] 3185; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 3186; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] 3187; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] 3188; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 3189; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3190; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7] 3191; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] 3192; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 3193; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15] 3194; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] 3195; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] 3196; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 3197; AVX2-NEXT: vmovdqa %ymm3, 64(%rax) 3198; AVX2-NEXT: vmovdqa %ymm2, 128(%rax) 3199; AVX2-NEXT: vmovdqa %ymm1, 192(%rax) 3200; AVX2-NEXT: vmovdqa %ymm0, 224(%rax) 3201; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3202; AVX2-NEXT: vmovaps %ymm0, 160(%rax) 3203; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3204; AVX2-NEXT: vmovaps %ymm0, 96(%rax) 3205; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3206; AVX2-NEXT: vmovaps %ymm0, 32(%rax) 3207; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3208; AVX2-NEXT: vmovaps %ymm0, (%rax) 3209; AVX2-NEXT: addq $88, %rsp 3210; AVX2-NEXT: vzeroupper 3211; AVX2-NEXT: retq 3212; 3213; AVX2-FP-LABEL: store_i8_stride8_vf32: 3214; AVX2-FP: # %bb.0: 3215; AVX2-FP-NEXT: subq $72, %rsp 3216; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3217; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 3218; AVX2-FP-NEXT: vmovdqa (%r10), %xmm5 3219; AVX2-FP-NEXT: vmovdqa (%rax), %xmm6 3220; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 3221; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm8 3222; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] 3223; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1 3224; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 3225; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 3226; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm9 3227; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] 3228; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] 3229; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm7 3230; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm3 3231; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm10 3232; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] 3233; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] 3234; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 3235; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13 3236; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm15 3237; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] 3238; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0 3239; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] 3240; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] 3241; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] 3242; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3243; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] 3244; AVX2-FP-NEXT: vpshufb %ymm14, %ymm8, %ymm8 3245; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] 3246; AVX2-FP-NEXT: vpshufb %ymm13, %ymm9, %ymm9 3247; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] 3248; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] 3249; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 3250; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 3251; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm12 = [2312,2826,3340,3854] 3252; AVX2-FP-NEXT: vpshufb %ymm12, %ymm9, %ymm9 3253; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] 3254; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] 3255; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 3256; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] 3257; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 3258; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 3259; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3260; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 3261; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3262; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm0 3263; AVX2-FP-NEXT: vpshufb %ymm13, %ymm1, %ymm1 3264; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] 3265; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] 3266; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 3267; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3268; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] 3269; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1 3270; AVX2-FP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 3271; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 3272; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] 3273; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 3274; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3275; AVX2-FP-NEXT: vmovdqa 16(%r10), %xmm8 3276; AVX2-FP-NEXT: vmovdqa 16(%rax), %xmm6 3277; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] 3278; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12 3279; AVX2-FP-NEXT: vmovdqa 16(%r9), %xmm7 3280; AVX2-FP-NEXT: vmovdqa 16(%r8), %xmm5 3281; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] 3282; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 3283; AVX2-FP-NEXT: vpshufb %ymm14, %ymm12, %ymm3 3284; AVX2-FP-NEXT: vpshufb %ymm13, %ymm10, %ymm4 3285; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] 3286; AVX2-FP-NEXT: vmovdqa 16(%rcx), %xmm4 3287; AVX2-FP-NEXT: vmovdqa 16(%rdx), %xmm2 3288; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 3289; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 3290; AVX2-FP-NEXT: vmovdqa 16(%rsi), %xmm1 3291; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm0 3292; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 3293; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm14 3294; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] 3295; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] 3296; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] 3297; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7] 3298; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] 3299; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] 3300; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 3301; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] 3302; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 3303; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] 3304; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15] 3305; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 3306; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 3307; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 3308; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] 3309; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 3310; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] 3311; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] 3312; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] 3313; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 3314; AVX2-FP-NEXT: vpshufb %ymm7, %ymm6, %ymm4 3315; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 3316; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 3317; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] 3318; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm8 = [1284,1798] 3319; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm5 3320; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 3321; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 3322; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 3323; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 3324; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] 3325; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] 3326; AVX2-FP-NEXT: vpshufb %ymm7, %ymm12, %ymm2 3327; AVX2-FP-NEXT: vpshufb %ymm6, %ymm10, %ymm4 3328; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] 3329; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm4 3330; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 3331; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 3332; AVX2-FP-NEXT: vpshufb %ymm5, %ymm9, %ymm4 3333; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] 3334; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] 3335; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 3336; AVX2-FP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 3337; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3338; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 3339; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] 3340; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3341; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 3342; AVX2-FP-NEXT: vpshufb %xmm8, %xmm11, %xmm5 3343; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero 3344; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 3345; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] 3346; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] 3347; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3348; AVX2-FP-NEXT: vmovdqa %ymm3, 64(%rax) 3349; AVX2-FP-NEXT: vmovdqa %ymm2, 128(%rax) 3350; AVX2-FP-NEXT: vmovdqa %ymm1, 192(%rax) 3351; AVX2-FP-NEXT: vmovdqa %ymm0, 224(%rax) 3352; AVX2-FP-NEXT: 
vmovdqa %ymm15, 160(%rax) 3353; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3354; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax) 3355; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3356; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) 3357; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3358; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) 3359; AVX2-FP-NEXT: addq $72, %rsp 3360; AVX2-FP-NEXT: vzeroupper 3361; AVX2-FP-NEXT: retq 3362; 3363; AVX2-FCP-LABEL: store_i8_stride8_vf32: 3364; AVX2-FCP: # %bb.0: 3365; AVX2-FCP-NEXT: subq $72, %rsp 3366; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3367; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 3368; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm5 3369; AVX2-FCP-NEXT: vmovdqa (%rax), %xmm6 3370; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 3371; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm8 3372; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] 3373; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm1 3374; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 3375; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 3376; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm9 3377; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] 3378; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] 3379; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm7 3380; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm3 3381; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm10 3382; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] 3383; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] 3384; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 3385; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13 3386; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm15 3387; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] 3388; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0 3389; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] 3390; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] 3391; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] 3392; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3393; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] 3394; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm8 3395; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] 3396; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm9, %ymm9 3397; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] 3398; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 
= [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] 3399; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 3400; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 3401; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm12 = [2312,2826,3340,3854] 3402; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm9 3403; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] 3404; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] 3405; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 3406; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] 3407; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 3408; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 3409; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3410; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 3411; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3412; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm0 3413; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1 3414; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] 3415; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] 3416; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 3417; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3418; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] 3419; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1 3420; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 3421; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 3422; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] 3423; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 3424; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3425; AVX2-FCP-NEXT: vmovdqa 16(%r10), %xmm8 3426; AVX2-FCP-NEXT: vmovdqa 16(%rax), %xmm6 3427; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] 3428; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12 3429; AVX2-FCP-NEXT: vmovdqa 16(%r9), %xmm7 3430; AVX2-FCP-NEXT: vmovdqa 16(%r8), %xmm5 3431; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] 3432; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 3433; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm12, %ymm3 3434; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm4 3435; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] 3436; AVX2-FCP-NEXT: vmovdqa 16(%rcx), %xmm4 3437; AVX2-FCP-NEXT: vmovdqa 16(%rdx), %xmm2 3438; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 3439; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 3440; AVX2-FCP-NEXT: vmovdqa 16(%rsi), %xmm1 3441; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm0 3442; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 3443; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm14 3444; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] 3445; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] 3446; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] 3447; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7] 3448; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] 3449; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] 3450; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 3451; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] 3452; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 3453; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] 3454; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15] 3455; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 3456; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 3457; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 3458; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] 3459; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 3460; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] 3461; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] 3462; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] 3463; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 3464; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm4 3465; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 3466; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 3467; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] 3468; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm8 = [1284,1798] 3469; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm5 3470; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 3471; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 3472; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 3473; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 3474; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] 3475; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] 3476; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm2 3477; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm4 3478; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] 3479; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm4 3480; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 3481; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 3482; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm4 3483; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] 3484; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] 3485; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 3486; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 3487; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3488; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 3489; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] 3490; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3491; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 3492; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm5 3493; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero 3494; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 3495; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] 3496; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] 3497; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3498; AVX2-FCP-NEXT: vmovdqa %ymm3, 64(%rax) 3499; AVX2-FCP-NEXT: vmovdqa %ymm2, 128(%rax) 3500; AVX2-FCP-NEXT: vmovdqa %ymm1, 192(%rax) 3501; AVX2-FCP-NEXT: vmovdqa %ymm0, 224(%rax) 3502; AVX2-FCP-NEXT: vmovdqa %ymm15, 160(%rax) 3503; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3504; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax) 3505; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3506; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) 3507; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3508; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) 3509; AVX2-FCP-NEXT: addq $72, %rsp 3510; AVX2-FCP-NEXT: vzeroupper 3511; AVX2-FCP-NEXT: retq 3512; 3513; AVX512-LABEL: store_i8_stride8_vf32: 3514; AVX512: # %bb.0: 3515; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 3516; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 3517; AVX512-NEXT: vmovdqa (%r10), %xmm1 3518; AVX512-NEXT: vmovdqa 16(%r10), %xmm11 3519; AVX512-NEXT: vmovdqa (%rax), %xmm2 3520; AVX512-NEXT: vmovdqa 16(%rax), %xmm12 3521; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 3522; AVX512-NEXT: vmovdqa64 %xmm2, %xmm21 3523; AVX512-NEXT: vmovdqa64 %xmm1, %xmm22 3524; 
AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 3525; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] 3526; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 3527; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 3528; AVX512-NEXT: vmovdqa (%r9), %xmm3 3529; AVX512-NEXT: vmovdqa 16(%r9), %xmm13 3530; AVX512-NEXT: vmovdqa (%r8), %xmm4 3531; AVX512-NEXT: vmovdqa 16(%r8), %xmm14 3532; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 3533; AVX512-NEXT: vmovdqa64 %xmm4, %xmm23 3534; AVX512-NEXT: vmovdqa64 %xmm3, %xmm24 3535; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] 3536; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,6,5,7,7] 3537; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 3538; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] 3539; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7],ymm5[8,9,10],ymm1[11],ymm5[12,13,14],ymm1[15] 3540; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] 3541; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3542; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 3543; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] 3544; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] 3545; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] 3546; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 3547; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] 3548; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] 3549; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 3550; AVX512-NEXT: vmovdqa (%rsi), %xmm1 3551; AVX512-NEXT: vmovdqa (%rdi), %xmm7 3552; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] 3553; AVX512-NEXT: vmovdqa64 %xmm1, %xmm25 3554; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 3555; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 3556; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] 3557; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 3558; AVX512-NEXT: vmovdqa (%rcx), %xmm8 3559; AVX512-NEXT: vmovdqa (%rdx), %xmm9 3560; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] 3561; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,4,6,5] 3562; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,6,6,7] 3563; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3 3564; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 3565; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7] 3566; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] 3567; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3568; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 3569; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3570; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 3571; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7] 3572; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 3573; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 
= xmm10[0,2,2,3,4,5,6,7] 3574; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 3575; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3576; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] 3577; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 3578; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 3579; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 3580; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] 3581; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 3582; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 3583; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,5,7] 3584; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,7,7] 3585; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm10 3586; AVX512-NEXT: vmovdqa 16(%rcx), %xmm5 3587; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] 3588; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3589; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 3590; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 3591; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] 3592; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6],ymm1[7],ymm10[8,9,10],ymm1[11],ymm10[12,13,14],ymm1[15] 3593; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,1,3,4,5,6,7] 3594; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] 3595; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 3596; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] 3597; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] 3598; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] 3599; AVX512-NEXT: vmovdqa 16(%rdx), %xmm10 3600; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 3601; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] 3602; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] 3603; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,6,6,7] 3604; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm15 3605; AVX512-NEXT: vmovdqa 16(%rsi), %xmm4 3606; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 3607; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 3608; AVX512-NEXT: vpshufd {{.*#+}} xmm19 = xmm0[2,3,2,3] 3609; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero 3610; AVX512-NEXT: vpshufd {{.*#+}} xmm20 = xmm0[3,3,3,3] 3611; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero 3612; AVX512-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm1 3613; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] 3614; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7,8],ymm15[9],ymm1[10,11,12],ymm15[13],ymm1[14,15] 3615; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3616; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 3617; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = 
xmm3[0,0,2,1,4,5,6,7] 3618; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero 3619; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] 3620; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 3621; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3622; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 3623; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 3624; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] 3625; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm15 3626; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] 3627; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 3628; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] 3629; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 3630; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] 3631; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] 3632; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,6,5,7,7] 3633; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6 3634; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,0,2,1,4,5,6,7] 3635; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3636; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm11, %ymm0 3637; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 3638; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] 3639; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] 3640; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] 3641; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] 3642; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 3643; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] 3644; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] 3645; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] 3646; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 3647; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 3648; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 3649; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 3650; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] 3651; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 3652; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] 3653; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] 3654; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] 3655; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 3656; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 3657; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] 3658; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] 3659; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3660; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 3661; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3662; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] 3663; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 3664; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 3665; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] 3666; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 3667; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 3668; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] 3669; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 3670; AVX512-NEXT: vmovdqa64 %xmm21, %xmm1 3671; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2 3672; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 3673; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] 3674; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] 3675; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3676; AVX512-NEXT: vmovdqa64 %xmm23, %xmm3 3677; AVX512-NEXT: vmovdqa64 %xmm24, %xmm4 3678; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 3679; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] 3680; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] 3681; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 3682; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] 3683; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 3684; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 3685; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 3686; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] 3687; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] 3688; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] 3689; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] 3690; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 3691; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] 3692; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] 3693; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] 3694; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 3695; AVX512-NEXT: vmovdqa64 %xmm25, %xmm2 3696; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] 3697; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] 3698; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 3699; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] 3700; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 3701; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 3702; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] 3703; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,6,7] 3704; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 3705; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 3706; 
AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7] 3707; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] 3708; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 3709; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 3710; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 3711; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] 3712; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero 3713; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 3714; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7] 3715; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 3716; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 3717; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] 3718; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 3719; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 3720; AVX512-NEXT: movw $-21846, %cx # imm = 0xAAAA 3721; AVX512-NEXT: kmovw %ecx, %k1 3722; AVX512-NEXT: vmovdqa32 %zmm16, %zmm18 {%k1} 3723; AVX512-NEXT: vmovdqa32 %zmm17, %zmm15 {%k1} 3724; AVX512-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} 3725; AVX512-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} 3726; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) 3727; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) 3728; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rax) 3729; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rax) 3730; AVX512-NEXT: vzeroupper 3731; AVX512-NEXT: retq 3732; 3733; AVX512-FCP-LABEL: store_i8_stride8_vf32: 3734; AVX512-FCP: # %bb.0: 3735; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3736; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 3737; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm1 3738; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm2 3739; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 3740; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 3741; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 3742; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 3743; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] 3744; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3 3745; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm4 3746; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 3747; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm24 3748; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm25 3749; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 3750; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] 3751; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm3 3752; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm20 3753; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] 3754; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 3755; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3756; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 3757; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 3758; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 3759; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm27 
3760; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] 3761; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 3762; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 3763; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm10 3764; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] 3765; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm30 3766; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 3767; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12 3768; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm13 3769; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] 3770; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 3771; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] 3772; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 3773; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] 3774; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm3 3775; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm28 3776; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] 3777; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 3778; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3779; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29 3780; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 3781; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} xmm9 = [1284,1798] 3782; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 3783; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 3784; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] 3785; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 3786; AVX512-FCP-NEXT: vmovdqa 16(%r10), %xmm8 3787; AVX512-FCP-NEXT: vmovdqa 16(%rax), %xmm11 3788; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] 3789; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 3790; AVX512-FCP-NEXT: vmovdqa 16(%r9), %xmm7 3791; AVX512-FCP-NEXT: vmovdqa 16(%r8), %xmm6 3792; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 3793; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 3794; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,10,11,10,11,10,11,0,1,2,3,12,13,12,13,12,13,10,11,14,15,14,15] 3795; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm3 3796; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm2 3797; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15] 3798; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,0,1,8,9,10,11,2,3,2,3,4,5,2,3,4,5,4,5,8,9,10,11,6,7,6,7] 3799; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3800; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 3801; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] 3802; AVX512-FCP-NEXT: vinserti64x4 $1, 
%ymm2, %zmm0, %zmm17 3803; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm5 3804; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 3805; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 3806; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 3807; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1 3808; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm16 3809; AVX512-FCP-NEXT: vmovdqa 16(%rcx), %xmm3 3810; AVX512-FCP-NEXT: vmovdqa 16(%rdx), %xmm2 3811; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 3812; AVX512-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 3813; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] 3814; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] 3815; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3816; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 3817; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm31 3818; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 3819; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] 3820; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7,8],ymm14[9],ymm0[10,11,12],ymm14[13],ymm0[14,15] 3821; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 3822; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] 3823; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] 3824; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 3825; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 3826; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] 3827; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm6 3828; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm11 3829; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm7 3830; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] 3831; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm7 3832; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 3833; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm14 3834; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1 3835; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] 3836; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm20 3837; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 3838; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] 3839; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 3840; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 3841; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm15 3842; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 3843; 
AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm6 3844; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm4 3845; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] 3846; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm9 3847; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 3848; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 3849; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm0 3850; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 3851; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 3852; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] 3853; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 3854; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 3855; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 3856; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 3857; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm3 3858; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 3859; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 3860; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 3861; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm4 3862; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 3863; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm5 3864; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] 3865; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 3866; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 3867; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] 3868; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 3869; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm3 3870; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] 3871; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] 3872; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 3873; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm5 3874; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 3875; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm6 3876; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] 3877; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 3878; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm6 3879; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 3880; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 3881; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15] 3882; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 3883; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3884; AVX512-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA 3885; AVX512-FCP-NEXT: kmovw %ecx, %k1 3886; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1} 3887; AVX512-FCP-NEXT: vmovdqa32 %zmm17, %zmm18 {%k1} 3888; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm1 {%k1} 3889; AVX512-FCP-NEXT: 
vmovdqa32 %zmm2, %zmm3 {%k1} 3890; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) 3891; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) 3892; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 128(%rax) 3893; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) 3894; AVX512-FCP-NEXT: vzeroupper 3895; AVX512-FCP-NEXT: retq 3896; 3897; AVX512DQ-LABEL: store_i8_stride8_vf32: 3898; AVX512DQ: # %bb.0: 3899; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 3900; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 3901; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 3902; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm10 3903; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 3904; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm11 3905; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 3906; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm19 3907; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm20 3908; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 3909; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 3910; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] 3911; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 3912; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 3913; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1 3914; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm12 3915; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 3916; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm13 3917; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 3918; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm21 3919; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm22 3920; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,4,6,5] 3921; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,6,6,7] 3922; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 3923; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] 3924; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] 3925; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3926; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 3927; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3928; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 3929; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] 3930; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero 3931; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] 3932; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero 3933; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 3934; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] 3935; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm16 3936; AVX512DQ-NEXT: vmovdqa (%r10), %xmm5 3937; AVX512DQ-NEXT: vmovdqa (%rax), %xmm6 3938; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] 3939; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,4,6,5] 3940; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,6,6,7] 3941; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm9 3942; AVX512DQ-NEXT: vmovdqa (%r9), %xmm7 3943; AVX512DQ-NEXT: vmovdqa (%r8), %xmm8 3944; AVX512DQ-NEXT: vpunpckhbw 
{{.*#+}} xmm14 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] 3945; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5,5,7] 3946; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,7,7] 3947; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm15, %ymm1 3948; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] 3949; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 3950; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6],ymm9[7],ymm1[8,9,10],ymm9[11],ymm1[12,13,14],ymm9[15] 3951; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7] 3952; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3953; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 3954; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] 3955; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,1,1,3,4,5,6,7] 3956; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,3,3,4,5,6,7] 3957; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm9, %ymm9 3958; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] 3959; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5,6],ymm0[7],ymm9[8,9,10],ymm0[11],ymm9[12,13,14],ymm0[15] 3960; AVX512DQ-NEXT: movw $-21846, %cx # imm = 0xAAAA 3961; AVX512DQ-NEXT: kmovw %ecx, %k1 3962; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} 3963; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] 3964; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 3965; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 3966; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[3,3,3,3] 3967; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero 3968; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] 3969; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,4,6,5] 3970; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,6,6,7] 3971; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm14, %ymm2 3972; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 3973; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] 3974; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] 3975; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm14 3976; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3977; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 3978; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3979; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,0,2,1,4,5,6,7] 3980; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero 3981; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 3982; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] 3983; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 3984; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 3985; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] 3986; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm15 3987; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 3988; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} 
xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] 3989; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] 3990; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] 3991; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18 3992; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm3 3993; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm9 3994; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] 3995; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,7] 3996; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,7,7] 3997; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 3998; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] 3999; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 4000; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 4001; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,1,1,3,4,5,6,7] 4002; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] 4003; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 4004; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,2,2,3,4,6,6,7] 4005; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 4006; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7],ymm2[8,9,10],ymm4[11],ymm2[12,13,14],ymm4[15] 4007; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] 4008; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] 4009; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7],ymm0[8,9,10],ymm1[11],ymm0[12,13,14],ymm1[15] 4010; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm17 {%k1} 4011; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] 4012; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 4013; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 4014; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] 4015; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 4016; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] 4017; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,4,6,5] 4018; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,6,6,7] 4019; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 4020; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 4021; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,1,3,3,6,5,7,7] 4022; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] 4023; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4024; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 4025; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4026; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,1,4,5,6,7] 4027; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero 4028; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 4029; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] 4030; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 4031; 
AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 4032; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] 4033; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm10 4034; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] 4035; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 4036; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] 4037; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 4038; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] 4039; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] 4040; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] 4041; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 4042; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7] 4043; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 4044; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 4045; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 4046; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 4047; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] 4048; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] 4049; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] 4050; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 4051; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] 4052; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] 4053; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] 4054; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm10 {%k1} 4055; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm0 4056; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm1 4057; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 4058; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 4059; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 4060; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] 4061; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 4062; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm3 4063; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm4 4064; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 4065; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] 4066; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,6,6,7] 4067; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 4068; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 4069; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] 4070; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] 4071; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4072; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 4073; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4074; 
AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] 4075; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 4076; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 4077; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] 4078; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 4079; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 4080; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] 4081; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4082; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 4083; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] 4084; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] 4085; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 4086; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] 4087; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] 4088; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] 4089; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 4090; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] 4091; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 4092; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 4093; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 4094; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] 4095; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] 4096; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] 4097; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] 4098; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 4099; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] 4100; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] 4101; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] 4102; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} 4103; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 4104; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) 4105; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%rax) 4106; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%rax) 4107; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rax) 4108; AVX512DQ-NEXT: vzeroupper 4109; AVX512DQ-NEXT: retq 4110; 4111; AVX512DQ-FCP-LABEL: store_i8_stride8_vf32: 4112; AVX512DQ-FCP: # %bb.0: 4113; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4114; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 4115; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 4116; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 4117; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 4118; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm19 4119; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 4120; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 4121; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] 4122; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm1 4123; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm18 4124; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3 4125; AVX512DQ-FCP-NEXT: vmovdqa 
(%rdi), %xmm4 4126; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 4127; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm21 4128; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 4129; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 4130; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm11 = [2312,2826,3340,3854] 4131; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 4132; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] 4133; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 4134; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 4135; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 4136; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm8 = [1284,1798] 4137; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm3 4138; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 4139; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 4140; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] 4141; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 4142; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm1 4143; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm14 4144; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] 4145; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm28 4146; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 4147; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] 4148; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm1 4149; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 4150; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm12 4151; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm13 4152; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] 4153; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 4154; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] 4155; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm3 4156; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm25 4157; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] 4158; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 4159; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 4160; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 4161; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 4162; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 4163; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm27 4164; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] 4165; AVX512DQ-FCP-NEXT: movw $-21846, %r11w # imm = 0xAAAA 4166; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 4167; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm17 {%k1} 4168; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm10 4169; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), 
%xmm9 4170; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] 4171; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0 4172; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm3 4173; AVX512DQ-FCP-NEXT: vmovdqa 16(%rcx), %xmm7 4174; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdx), %xmm6 4175; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 4176; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 4177; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] 4178; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] 4179; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 4180; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2 4181; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm29 4182; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 4183; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] 4184; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] 4185; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 4186; AVX512DQ-FCP-NEXT: vmovdqa 16(%r10), %xmm8 4187; AVX512DQ-FCP-NEXT: vmovdqa 16(%rax), %xmm4 4188; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] 4189; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0 4190; AVX512DQ-FCP-NEXT: vmovdqa 16(%r9), %xmm5 4191; AVX512DQ-FCP-NEXT: vmovdqa 16(%r8), %xmm3 4192; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 4193; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 4194; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,10,11,10,11,10,11,0,1,2,3,12,13,12,13,12,13,10,11,14,15,14,15] 4195; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm1 4196; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm2 4197; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] 4198; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,0,1,8,9,10,11,2,3,2,3,4,5,2,3,4,5,4,5,8,9,10,11,6,7,6,7] 4199; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 4200; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm2 4201; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] 4202; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} 4203; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] 4204; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] 4205; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 4206; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 4207; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm10 
4208; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 4209; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm11 4210; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm6 4211; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15] 4212; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm9 4213; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 4214; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 4215; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm15 4216; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 4217; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 4218; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] 4219; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 4220; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 4221; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 4222; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 4223; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 4224; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm5 4225; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm3 4226; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm6 4227; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm4 4228; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] 4229; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm8 4230; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 4231; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 4232; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 4233; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] 4234; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm3, %zmm1, %zmm0 {%k1} 4235; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm1 4236; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 4237; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 4238; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 4239; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 4240; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 4241; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 4242; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 4243; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 4244; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm4 4245; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] 4246; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 4247; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm4 4248; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 4249; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 4250; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] 4251; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 4252; AVX512DQ-FCP-NEXT: vmovdqa64 
%xmm28, %xmm2 4253; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] 4254; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] 4255; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 4256; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm4 4257; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 4258; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm5 4259; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] 4260; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 4261; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 4262; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] 4263; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm4, %zmm2, %zmm1 {%k1} 4264; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4265; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) 4266; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) 4267; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax) 4268; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) 4269; AVX512DQ-FCP-NEXT: vzeroupper 4270; AVX512DQ-FCP-NEXT: retq 4271; 4272; AVX512BW-LABEL: store_i8_stride8_vf32: 4273; AVX512BW: # %bb.0: 4274; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 4275; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 4276; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 4277; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 4278; AVX512BW-NEXT: vmovdqa 16(%rsi), %xmm11 4279; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 4280; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm12 4281; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 4282; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4283; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] 4284; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 4285; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 4286; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3 4287; AVX512BW-NEXT: vmovdqa 16(%rcx), %xmm13 4288; AVX512BW-NEXT: vmovdqa (%rdx), %xmm4 4289; AVX512BW-NEXT: vmovdqa 16(%rdx), %xmm14 4290; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 4291; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] 4292; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero 4293; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,2,2,3,4,5,6,7] 4294; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero 4295; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 4296; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] 4297; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] 4298; AVX512BW-NEXT: vpermt2w %ymm6, %ymm22, %ymm0 4299; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 4300; AVX512BW-NEXT: vmovdqa (%r11), %xmm6 4301; AVX512BW-NEXT: vmovdqa 16(%r11), %xmm15 4302; AVX512BW-NEXT: vmovdqa (%r10), %xmm7 4303; AVX512BW-NEXT: vmovdqa64 16(%r10), %xmm17 4304; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = 
xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] 4305; AVX512BW-NEXT: vmovdqa (%r9), %xmm8 4306; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm18 4307; AVX512BW-NEXT: vmovdqa (%r8), %xmm9 4308; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm19 4309; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] 4310; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] 4311; AVX512BW-NEXT: vpermt2w %zmm16, %zmm24, %zmm20 4312; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA 4313; AVX512BW-NEXT: kmovd %ecx, %k1 4314; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm0 {%k1} 4315; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 4316; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero 4317; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm16[1,1,1,1] 4318; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero 4319; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm5 4320; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 4321; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] 4322; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero 4323; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm23 = xmm20[0,2,2,3,4,5,6,7] 4324; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm23 = xmm23[0],zero,xmm23[1],zero 4325; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm21, %ymm10 4326; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7,8],ymm10[9],ymm5[10,11,12],ymm10[13],ymm5[14,15] 4327; AVX512BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm16 4328; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm5, %zmm16 4329; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] 4330; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7] 4331; AVX512BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm10 4332; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm16 {%k1} 4333; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] 4334; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero 4335; AVX512BW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[1,1,1,1] 4336; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero 4337; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 4338; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] 4339; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7] 4340; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero 4341; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,2,2,3,4,5,6,7] 
4342; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero 4343; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 4344; AVX512BW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] 4345; AVX512BW-NEXT: vpermt2w %ymm11, %ymm22, %ymm5 4346; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 4347; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] 4348; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] 4349; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm11 4350; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} 4351; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 4352; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 4353; AVX512BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] 4354; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 4355; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm2 4356; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 4357; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] 4358; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 4359; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,2,2,3,4,5,6,7] 4360; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero 4361; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 4362; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] 4363; AVX512BW-NEXT: vpermt2w %ymm3, %ymm22, %ymm1 4364; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 4365; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 4366; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 4367; AVX512BW-NEXT: vpermt2w %zmm2, %zmm24, %zmm3 4368; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} 4369; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) 4370; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) 4371; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rax) 4372; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) 4373; AVX512BW-NEXT: vzeroupper 4374; AVX512BW-NEXT: retq 4375; 4376; AVX512BW-FCP-LABEL: store_i8_stride8_vf32: 4377; AVX512BW-FCP: # %bb.0: 4378; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4379; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 4380; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 4381; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm0 4382; AVX512BW-FCP-NEXT: vmovdqa 16(%r11), %xmm1 4383; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm2 4384; AVX512BW-FCP-NEXT: vmovdqa 16(%r10), %xmm3 4385; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 4386; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm5 4387; 
AVX512BW-FCP-NEXT: vmovdqa 16(%r9), %xmm6 4388; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm7 4389; AVX512BW-FCP-NEXT: vmovdqa 16(%r8), %xmm8 4390; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] 4391; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] 4392; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm9 4393; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm4 4394; AVX512BW-FCP-NEXT: vmovdqa 16(%rcx), %xmm11 4395; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 4396; AVX512BW-FCP-NEXT: vmovdqa 16(%rdx), %xmm13 4397; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] 4398; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm15 4399; AVX512BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm16 4400; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm17 4401; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm18 4402; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] 4403; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] 4404; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm20, %zmm19 4405; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA 4406; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 4407; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm19 {%k1} 4408; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 4409; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] 4410; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm10, %zmm14 4411; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] 4412; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] 4413; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 4414; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm21 {%k1} 4415; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 4416; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] 4417; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm10, %zmm3 4418; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] 4419; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] 4420; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm20, %zmm6 4421; AVX512BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1} 4422; 
AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 4423; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] 4424; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm1 4425; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] 4426; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] 4427; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm20, %zmm2 4428; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} 4429; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) 4430; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) 4431; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rax) 4432; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) 4433; AVX512BW-FCP-NEXT: vzeroupper 4434; AVX512BW-FCP-NEXT: retq 4435; 4436; AVX512DQ-BW-LABEL: store_i8_stride8_vf32: 4437; AVX512DQ-BW: # %bb.0: 4438; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 4439; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 4440; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 4441; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1 4442; AVX512DQ-BW-NEXT: vmovdqa 16(%rsi), %xmm11 4443; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm2 4444; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm12 4445; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 4446; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4447; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] 4448; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 4449; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 4450; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3 4451; AVX512DQ-BW-NEXT: vmovdqa 16(%rcx), %xmm13 4452; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm4 4453; AVX512DQ-BW-NEXT: vmovdqa 16(%rdx), %xmm14 4454; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 4455; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] 4456; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero 4457; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,2,2,3,4,5,6,7] 4458; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero 4459; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 4460; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] 4461; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] 4462; AVX512DQ-BW-NEXT: vpermt2w %ymm6, %ymm22, %ymm0 4463; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 4464; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm6 4465; AVX512DQ-BW-NEXT: vmovdqa 16(%r11), %xmm15 4466; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm7 4467; AVX512DQ-BW-NEXT: vmovdqa64 16(%r10), %xmm17 4468; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = 
xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] 4469; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm8 4470; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm18 4471; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm9 4472; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm19 4473; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] 4474; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] 4475; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm24, %zmm20 4476; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA 4477; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 4478; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm0 {%k1} 4479; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 4480; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero 4481; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm16[1,1,1,1] 4482; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero 4483; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm5 4484; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 4485; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] 4486; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero 4487; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm23 = xmm20[0,2,2,3,4,5,6,7] 4488; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm23 = xmm23[0],zero,xmm23[1],zero 4489; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm21, %ymm10 4490; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7,8],ymm10[9],ymm5[10,11,12],ymm10[13],ymm5[14,15] 4491; AVX512DQ-BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm16 4492; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm5, %zmm16 4493; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] 4494; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7] 4495; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm10 4496; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm16 {%k1} 4497; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] 4498; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero 4499; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[1,1,1,1] 4500; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero 4501; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 4502; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] 4503; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7] 4504; AVX512DQ-BW-NEXT: vpmovzxdq 
{{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero 4505; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,2,2,3,4,5,6,7] 4506; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero 4507; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 4508; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] 4509; AVX512DQ-BW-NEXT: vpermt2w %ymm11, %ymm22, %ymm5 4510; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 4511; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] 4512; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] 4513; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm11 4514; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} 4515; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 4516; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 4517; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] 4518; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 4519; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm2 4520; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 4521; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] 4522; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 4523; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,2,2,3,4,5,6,7] 4524; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero 4525; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 4526; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] 4527; AVX512DQ-BW-NEXT: vpermt2w %ymm3, %ymm22, %ymm1 4528; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 4529; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 4530; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 4531; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm24, %zmm3 4532; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} 4533; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) 4534; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%rax) 4535; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 128(%rax) 4536; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) 4537; AVX512DQ-BW-NEXT: vzeroupper 4538; AVX512DQ-BW-NEXT: retq 4539; 4540; AVX512DQ-BW-FCP-LABEL: store_i8_stride8_vf32: 4541; AVX512DQ-BW-FCP: # %bb.0: 4542; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4543; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 4544; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 4545; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm0 4546; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r11), %xmm1 4547; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm2 4548; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r10), %xmm3 4549; 
AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 4550; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm5 4551; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r9), %xmm6 4552; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm7 4553; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r8), %xmm8 4554; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] 4555; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] 4556; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm9 4557; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm4 4558; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rcx), %xmm11 4559; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 4560; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdx), %xmm13 4561; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] 4562; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm15 4563; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm16 4564; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm17 4565; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm18 4566; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] 4567; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] 4568; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm20, %zmm19 4569; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA 4570; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 4571; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm19 {%k1} 4572; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 4573; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] 4574; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm10, %zmm14 4575; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] 4576; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] 4577; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 4578; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm21 {%k1} 4579; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 4580; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] 4581; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm10, %zmm3 4582; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] 4583; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] 4584; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm20, %zmm6 4585; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1} 4586; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 4587; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] 4588; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm1 4589; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] 4590; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] 4591; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm20, %zmm2 4592; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} 4593; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) 4594; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) 4595; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rax) 4596; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) 4597; AVX512DQ-BW-FCP-NEXT: vzeroupper 4598; AVX512DQ-BW-FCP-NEXT: retq 4599 %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 4600 %in.vec1 = load <32 x i8>, ptr %in.vecptr1, align 64 4601 %in.vec2 = load <32 x i8>, ptr %in.vecptr2, align 64 4602 %in.vec3 = load <32 x i8>, ptr %in.vecptr3, align 64 4603 %in.vec4 = load <32 x i8>, ptr %in.vecptr4, align 64 4604 %in.vec5 = load <32 x i8>, ptr %in.vecptr5, align 64 4605 %in.vec6 = load <32 x i8>, ptr %in.vecptr6, align 64 4606 %in.vec7 = load <32 x i8>, ptr %in.vecptr7, align 64 4607 %1 = shufflevector <32 x i8> %in.vec0, <32 x i8> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 4608 %2 = shufflevector <32 x i8> %in.vec2, <32 x i8> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 4609 %3 = shufflevector <32 x i8> %in.vec4, <32 x i8> %in.vec5, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, 
i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 4610 %4 = shufflevector <32 x i8> %in.vec6, <32 x i8> %in.vec7, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 4611 %5 = shufflevector <64 x i8> %1, <64 x i8> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 4612 %6 = shufflevector <64 x i8> %3, <64 x i8> %4, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 4613 %7 = shufflevector <128 x i8> %5, <128 x i8> %6, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, 
i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> 4614 %interleaved.vec = shufflevector <256 x i8> %7, <256 x i8> poison, <256 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 160, i32 192, i32 224, i32 1, i32 33, i32 65, i32 97, i32 129, i32 161, i32 193, i32 225, i32 2, i32 34, i32 66, i32 98, i32 130, i32 162, i32 194, i32 226, i32 3, i32 35, i32 67, i32 99, i32 131, i32 163, i32 195, i32 227, i32 4, i32 36, i32 68, i32 100, i32 132, i32 164, i32 196, i32 228, i32 5, i32 37, i32 69, i32 101, i32 133, i32 165, i32 197, i32 229, i32 6, i32 38, i32 70, i32 102, i32 134, i32 166, i32 198, i32 230, i32 7, i32 39, i32 71, i32 103, i32 135, i32 167, i32 199, i32 231, i32 8, i32 40, i32 72, i32 104, i32 136, i32 168, i32 200, i32 232, i32 9, i32 41, i32 73, i32 105, i32 137, i32 169, i32 201, i32 233, i32 10, i32 42, i32 74, i32 106, i32 138, i32 170, i32 202, i32 234, i32 11, i32 43, i32 75, i32 107, i32 139, i32 171, i32 203, i32 235, i32 12, i32 44, i32 76, i32 108, i32 140, i32 172, i32 204, i32 236, i32 13, i32 45, i32 77, i32 109, i32 141, i32 173, i32 205, i32 237, i32 14, i32 46, i32 78, i32 110, i32 142, i32 174, i32 206, i32 238, i32 15, i32 47, i32 79, i32 111, i32 143, i32 175, i32 207, i32 239, i32 16, i32 48, i32 80, i32 112, i32 144, i32 176, i32 208, i32 240, i32 17, i32 49, i32 81, i32 113, i32 145, i32 177, i32 209, i32 241, i32 18, i32 50, i32 82, i32 114, i32 146, i32 178, i32 210, i32 242, i32 19, i32 51, i32 83, i32 115, i32 147, i32 179, i32 211, i32 243, i32 20, i32 52, i32 84, i32 116, i32 148, i32 180, i32 212, i32 244, i32 21, i32 53, i32 85, i32 117, i32 149, i32 181, i32 213, i32 245, i32 22, i32 54, i32 86, i32 118, i32 150, i32 182, i32 214, i32 246, i32 23, i32 55, i32 87, i32 119, i32 151, i32 183, 
i32 215, i32 247, i32 24, i32 56, i32 88, i32 120, i32 152, i32 184, i32 216, i32 248, i32 25, i32 57, i32 89, i32 121, i32 153, i32 185, i32 217, i32 249, i32 26, i32 58, i32 90, i32 122, i32 154, i32 186, i32 218, i32 250, i32 27, i32 59, i32 91, i32 123, i32 155, i32 187, i32 219, i32 251, i32 28, i32 60, i32 92, i32 124, i32 156, i32 188, i32 220, i32 252, i32 29, i32 61, i32 93, i32 125, i32 157, i32 189, i32 221, i32 253, i32 30, i32 62, i32 94, i32 126, i32 158, i32 190, i32 222, i32 254, i32 31, i32 63, i32 95, i32 127, i32 159, i32 191, i32 223, i32 255> 4615 store <256 x i8> %interleaved.vec, ptr %out.vec, align 64 4616 ret void 4617} 4618 4619define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { 4620; SSE-LABEL: store_i8_stride8_vf64: 4621; SSE: # %bb.0: 4622; SSE-NEXT: subq $312, %rsp # imm = 0x138 4623; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 4624; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 4625; SSE-NEXT: movdqa (%rdi), %xmm3 4626; SSE-NEXT: movdqa (%rsi), %xmm5 4627; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4628; SSE-NEXT: movdqa (%rdx), %xmm4 4629; SSE-NEXT: movdqa (%rcx), %xmm8 4630; SSE-NEXT: movdqa (%r8), %xmm6 4631; SSE-NEXT: movdqa (%r9), %xmm9 4632; SSE-NEXT: movdqa (%r10), %xmm7 4633; SSE-NEXT: movdqa (%rax), %xmm10 4634; SSE-NEXT: movdqa %xmm7, %xmm0 4635; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 4636; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 4637; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,1] 4638; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,65535,0] 4639; SSE-NEXT: movdqa %xmm13, %xmm12 4640; SSE-NEXT: pandn %xmm2, %xmm12 4641; SSE-NEXT: movdqa %xmm6, %xmm11 4642; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] 4643; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,1,1,3,4,5,6,7] 4644; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,2,1] 4645; SSE-NEXT: pand %xmm13, %xmm14 4646; SSE-NEXT: por %xmm12, %xmm14 4647; SSE-NEXT: movdqa %xmm4, %xmm12 4648; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] 4649; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,0,2,1,4,5,6,7] 4650; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] 4651; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,0,65535,65535] 4652; SSE-NEXT: movdqa %xmm1, %xmm15 4653; SSE-NEXT: pandn %xmm2, %xmm15 4654; SSE-NEXT: movdqa %xmm3, %xmm2 4655; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 4656; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] 4657; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] 4658; SSE-NEXT: pand %xmm1, %xmm5 4659; SSE-NEXT: por %xmm15, %xmm5 4660; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[1,3,2,3] 4661; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 4662; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] 4663; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4664; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] 4665; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] 
4666; SSE-NEXT: movdqa %xmm13, %xmm14 4667; SSE-NEXT: pandn %xmm5, %xmm14 4668; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[2,1,3,3,4,5,6,7] 4669; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] 4670; SSE-NEXT: pand %xmm13, %xmm5 4671; SSE-NEXT: por %xmm14, %xmm5 4672; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[0,2,2,3,4,5,6,7] 4673; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3] 4674; SSE-NEXT: movdqa %xmm1, %xmm15 4675; SSE-NEXT: pandn %xmm14, %xmm15 4676; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] 4677; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] 4678; SSE-NEXT: pand %xmm1, %xmm14 4679; SSE-NEXT: por %xmm15, %xmm14 4680; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] 4681; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] 4682; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] 4683; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4684; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5] 4685; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 4686; SSE-NEXT: movdqa %xmm13, %xmm14 4687; SSE-NEXT: pandn %xmm5, %xmm14 4688; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,4,5,5,7] 4689; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 4690; SSE-NEXT: pand %xmm13, %xmm5 4691; SSE-NEXT: por %xmm14, %xmm5 4692; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,4,6,5] 4693; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] 4694; SSE-NEXT: movdqa %xmm1, %xmm15 4695; SSE-NEXT: pandn %xmm14, %xmm15 4696; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] 4697; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] 4698; SSE-NEXT: pand %xmm1, %xmm14 4699; SSE-NEXT: por %xmm15, %xmm14 4700; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] 4701; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] 4702; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] 4703; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4704; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 4705; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 4706; SSE-NEXT: movdqa %xmm13, %xmm5 4707; SSE-NEXT: pandn %xmm0, %xmm5 4708; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,6,5,7,7] 4709; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] 4710; SSE-NEXT: pand %xmm13, %xmm11 4711; SSE-NEXT: por %xmm5, %xmm11 4712; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] 4713; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] 4714; SSE-NEXT: movdqa %xmm1, %xmm5 4715; SSE-NEXT: pandn %xmm0, %xmm5 4716; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] 4717; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,5,5,5,5] 4718; SSE-NEXT: pand %xmm1, %xmm12 4719; SSE-NEXT: por %xmm5, %xmm12 4720; SSE-NEXT: movdqa 16(%r8), %xmm0 4721; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,3,2,3] 4722; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,2,2,3] 4723; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 4724; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4725; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] 4726; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,0,2,1,4,5,6,7] 4727; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] 4728; SSE-NEXT: movdqa %xmm13, %xmm10 4729; SSE-NEXT: pandn %xmm5, %xmm10 4730; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] 4731; SSE-NEXT: pshuflw {{.*#+}} xmm5 = 
xmm6[0,1,1,3,4,5,6,7] 4732; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] 4733; SSE-NEXT: pand %xmm13, %xmm5 4734; SSE-NEXT: por %xmm10, %xmm5 4735; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] 4736; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 4737; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,0,2,1,4,5,6,7] 4738; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] 4739; SSE-NEXT: movdqa %xmm1, %xmm9 4740; SSE-NEXT: pandn %xmm8, %xmm9 4741; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 4742; SSE-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] 4743; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,0,0] 4744; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] 4745; SSE-NEXT: pand %xmm1, %xmm8 4746; SSE-NEXT: por %xmm9, %xmm8 4747; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] 4748; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 4749; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4750; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7] 4751; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] 4752; SSE-NEXT: movdqa %xmm13, %xmm8 4753; SSE-NEXT: pandn %xmm5, %xmm8 4754; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7] 4755; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] 4756; SSE-NEXT: pand %xmm13, %xmm5 4757; SSE-NEXT: por %xmm8, %xmm5 4758; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] 4759; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7] 4760; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] 4761; SSE-NEXT: movdqa %xmm1, %xmm9 4762; SSE-NEXT: pandn %xmm8, %xmm9 4763; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] 4764; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] 4765; SSE-NEXT: pand %xmm1, %xmm8 4766; SSE-NEXT: por %xmm9, %xmm8 4767; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] 4768; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 4769; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4770; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,4,6,5] 4771; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 4772; SSE-NEXT: movdqa %xmm13, %xmm8 4773; SSE-NEXT: pandn %xmm5, %xmm8 4774; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,5,7] 4775; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] 4776; SSE-NEXT: pand %xmm13, %xmm9 4777; SSE-NEXT: por %xmm8, %xmm9 4778; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,4,6,5] 4779; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] 4780; SSE-NEXT: movdqa %xmm1, %xmm8 4781; SSE-NEXT: pandn %xmm5, %xmm8 4782; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,2,2] 4783; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,5,5,5,5] 4784; SSE-NEXT: pand %xmm1, %xmm10 4785; SSE-NEXT: por %xmm8, %xmm10 4786; SSE-NEXT: movdqa 16(%r10), %xmm5 4787; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,3,2,3] 4788; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] 4789; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] 4790; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4791; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] 4792; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 4793; SSE-NEXT: movdqa %xmm13, %xmm8 4794; SSE-NEXT: pandn %xmm7, %xmm8 4795; SSE-NEXT: movdqa 16(%rax), %xmm7 4796; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4797; SSE-NEXT: pshufhw {{.*#+}} xmm6 = 
xmm6[0,1,2,3,6,5,7,7] 4798; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 4799; SSE-NEXT: pand %xmm13, %xmm6 4800; SSE-NEXT: por %xmm8, %xmm6 4801; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] 4802; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] 4803; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] 4804; SSE-NEXT: movdqa %xmm1, %xmm8 4805; SSE-NEXT: pandn %xmm4, %xmm8 4806; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] 4807; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] 4808; SSE-NEXT: pand %xmm1, %xmm3 4809; SSE-NEXT: por %xmm8, %xmm3 4810; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 4811; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] 4812; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4813; SSE-NEXT: movdqa %xmm5, %xmm10 4814; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] 4815; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,0,2,1,4,5,6,7] 4816; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] 4817; SSE-NEXT: movdqa %xmm13, %xmm4 4818; SSE-NEXT: pandn %xmm3, %xmm4 4819; SSE-NEXT: movdqa 16(%r9), %xmm6 4820; SSE-NEXT: movdqa %xmm0, %xmm11 4821; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] 4822; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7] 4823; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,0,2,1] 4824; SSE-NEXT: pand %xmm13, %xmm14 4825; SSE-NEXT: por %xmm4, %xmm14 4826; SSE-NEXT: movdqa 16(%rdx), %xmm3 4827; SSE-NEXT: movdqa 16(%rcx), %xmm8 4828; SSE-NEXT: movdqa %xmm3, %xmm12 4829; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] 4830; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,0,2,1,4,5,6,7] 4831; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] 4832; SSE-NEXT: movdqa %xmm1, %xmm15 4833; SSE-NEXT: pandn %xmm4, %xmm15 4834; SSE-NEXT: movdqa 16(%rdi), %xmm4 4835; SSE-NEXT: movdqa 16(%rsi), %xmm9 4836; SSE-NEXT: movdqa %xmm4, %xmm2 4837; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 4838; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,0,0] 4839; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 4840; SSE-NEXT: pand %xmm1, %xmm7 4841; SSE-NEXT: por %xmm15, %xmm7 4842; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[1,3,2,3] 4843; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 4844; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] 4845; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4846; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7] 4847; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] 4848; SSE-NEXT: movdqa %xmm13, %xmm14 4849; SSE-NEXT: pandn %xmm7, %xmm14 4850; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7] 4851; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] 4852; SSE-NEXT: pand %xmm13, %xmm7 4853; SSE-NEXT: por %xmm14, %xmm7 4854; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[0,2,2,3,4,5,6,7] 4855; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3] 4856; SSE-NEXT: movdqa %xmm1, %xmm15 4857; SSE-NEXT: pandn %xmm14, %xmm15 4858; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] 4859; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] 4860; SSE-NEXT: pand %xmm1, %xmm14 4861; SSE-NEXT: por 
%xmm15, %xmm14 4862; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] 4863; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] 4864; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] 4865; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4866; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,4,6,5] 4867; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 4868; SSE-NEXT: movdqa %xmm13, %xmm14 4869; SSE-NEXT: pandn %xmm7, %xmm14 4870; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,4,5,5,7] 4871; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 4872; SSE-NEXT: pand %xmm13, %xmm7 4873; SSE-NEXT: por %xmm14, %xmm7 4874; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,4,6,5] 4875; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] 4876; SSE-NEXT: movdqa %xmm1, %xmm15 4877; SSE-NEXT: pandn %xmm14, %xmm15 4878; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] 4879; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] 4880; SSE-NEXT: pand %xmm1, %xmm14 4881; SSE-NEXT: por %xmm15, %xmm14 4882; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] 4883; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] 4884; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] 4885; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4886; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,6,6,7] 4887; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 4888; SSE-NEXT: movdqa %xmm13, %xmm10 4889; SSE-NEXT: pandn %xmm7, %xmm10 4890; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,6,5,7,7] 4891; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 4892; SSE-NEXT: pand %xmm13, %xmm7 4893; SSE-NEXT: por %xmm10, %xmm7 4894; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,4,6,6,7] 4895; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] 4896; SSE-NEXT: movdqa %xmm1, %xmm11 4897; SSE-NEXT: pandn %xmm10, %xmm11 4898; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[3,3,3,3] 4899; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] 4900; SSE-NEXT: pand %xmm1, %xmm10 4901; SSE-NEXT: por %xmm11, %xmm10 4902; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] 4903; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] 4904; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] 4905; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4906; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 4907; SSE-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] 4908; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7] 4909; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] 4910; SSE-NEXT: movdqa %xmm13, %xmm10 4911; SSE-NEXT: pandn %xmm7, %xmm10 4912; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] 4913; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,1,1,3,4,5,6,7] 4914; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 4915; SSE-NEXT: pand %xmm13, %xmm6 4916; SSE-NEXT: por %xmm10, %xmm6 4917; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] 4918; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,0,2,1,4,5,6,7] 4919; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] 4920; SSE-NEXT: movdqa %xmm1, %xmm8 4921; SSE-NEXT: pandn %xmm7, %xmm8 4922; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 4923; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,0,0] 4924; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 4925; SSE-NEXT: pand %xmm1, %xmm7 4926; SSE-NEXT: por %xmm8, %xmm7 4927; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] 4928; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] 4929; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] 4930; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4931; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7] 4932; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 4933; SSE-NEXT: movdqa %xmm13, %xmm7 4934; SSE-NEXT: pandn %xmm6, %xmm7 4935; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] 4936; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 4937; SSE-NEXT: pand %xmm13, %xmm6 4938; SSE-NEXT: por %xmm7, %xmm6 4939; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] 4940; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] 4941; SSE-NEXT: movdqa %xmm1, %xmm8 4942; SSE-NEXT: pandn %xmm7, %xmm8 4943; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] 4944; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 4945; SSE-NEXT: pand %xmm1, %xmm7 4946; SSE-NEXT: por %xmm8, %xmm7 4947; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] 4948; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] 4949; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] 4950; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4951; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] 4952; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 4953; SSE-NEXT: movdqa %xmm13, %xmm7 4954; SSE-NEXT: pandn %xmm6, %xmm7 4955; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,5,7] 4956; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 4957; SSE-NEXT: pand %xmm13, %xmm6 4958; SSE-NEXT: por %xmm7, %xmm6 4959; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,4,6,5] 4960; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] 4961; SSE-NEXT: movdqa %xmm1, %xmm8 4962; SSE-NEXT: pandn %xmm7, %xmm8 4963; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] 4964; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 4965; SSE-NEXT: pand %xmm1, %xmm7 4966; SSE-NEXT: por %xmm8, %xmm7 4967; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] 4968; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] 4969; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] 4970; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4971; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] 4972; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 4973; SSE-NEXT: movdqa %xmm13, %xmm6 4974; SSE-NEXT: pandn %xmm5, %xmm6 4975; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] 4976; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 4977; SSE-NEXT: pand %xmm13, %xmm0 4978; SSE-NEXT: por %xmm6, %xmm0 4979; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 4980; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 4981; SSE-NEXT: movdqa %xmm1, %xmm5 4982; SSE-NEXT: pandn %xmm3, %xmm5 4983; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] 4984; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] 4985; SSE-NEXT: pand %xmm1, %xmm3 4986; SSE-NEXT: por %xmm5, %xmm3 4987; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] 4988; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 4989; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 4990; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4991; 
SSE-NEXT: movdqa 32(%r10), %xmm0 4992; SSE-NEXT: movdqa 32(%rax), %xmm2 4993; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4994; SSE-NEXT: movdqa %xmm0, %xmm10 4995; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] 4996; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,0,2,1,4,5,6,7] 4997; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] 4998; SSE-NEXT: movdqa %xmm13, %xmm4 4999; SSE-NEXT: pandn %xmm3, %xmm4 5000; SSE-NEXT: movdqa 32(%r8), %xmm3 5001; SSE-NEXT: movdqa 32(%r9), %xmm7 5002; SSE-NEXT: movdqa %xmm3, %xmm11 5003; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] 5004; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[0,1,1,3,4,5,6,7] 5005; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,0,2,1] 5006; SSE-NEXT: pand %xmm13, %xmm14 5007; SSE-NEXT: por %xmm4, %xmm14 5008; SSE-NEXT: movdqa 32(%rdx), %xmm4 5009; SSE-NEXT: movdqa 32(%rcx), %xmm8 5010; SSE-NEXT: movdqa %xmm4, %xmm12 5011; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] 5012; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[0,0,2,1,4,5,6,7] 5013; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3] 5014; SSE-NEXT: movdqa %xmm1, %xmm15 5015; SSE-NEXT: pandn %xmm5, %xmm15 5016; SSE-NEXT: movdqa 32(%rdi), %xmm5 5017; SSE-NEXT: movdqa 32(%rsi), %xmm9 5018; SSE-NEXT: movdqa %xmm5, %xmm2 5019; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 5020; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,0,0] 5021; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] 5022; SSE-NEXT: pand %xmm1, %xmm6 5023; SSE-NEXT: por %xmm15, %xmm6 5024; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[1,3,2,3] 5025; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 5026; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] 5027; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5028; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,2,2,3,4,5,6,7] 5029; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 5030; SSE-NEXT: movdqa %xmm13, %xmm14 5031; SSE-NEXT: pandn %xmm6, %xmm14 5032; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[2,1,3,3,4,5,6,7] 5033; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 5034; SSE-NEXT: pand %xmm13, %xmm6 5035; SSE-NEXT: por %xmm14, %xmm6 5036; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[0,2,2,3,4,5,6,7] 5037; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3] 5038; SSE-NEXT: movdqa %xmm1, %xmm15 5039; SSE-NEXT: pandn %xmm14, %xmm15 5040; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] 5041; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] 5042; SSE-NEXT: pand %xmm1, %xmm14 5043; SSE-NEXT: por %xmm15, %xmm14 5044; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] 5045; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] 5046; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] 5047; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5048; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,4,6,5] 5049; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 5050; SSE-NEXT: movdqa %xmm13, %xmm14 5051; SSE-NEXT: pandn %xmm6, %xmm14 5052; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,4,5,5,7] 5053; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 
5054; SSE-NEXT: pand %xmm13, %xmm6 5055; SSE-NEXT: por %xmm14, %xmm6 5056; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,4,6,5] 5057; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] 5058; SSE-NEXT: movdqa %xmm1, %xmm15 5059; SSE-NEXT: pandn %xmm14, %xmm15 5060; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] 5061; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] 5062; SSE-NEXT: pand %xmm1, %xmm14 5063; SSE-NEXT: por %xmm15, %xmm14 5064; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] 5065; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] 5066; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] 5067; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5068; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,6,6,7] 5069; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 5070; SSE-NEXT: movdqa %xmm13, %xmm10 5071; SSE-NEXT: pandn %xmm6, %xmm10 5072; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,6,5,7,7] 5073; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 5074; SSE-NEXT: pand %xmm13, %xmm6 5075; SSE-NEXT: por %xmm10, %xmm6 5076; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,4,6,6,7] 5077; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] 5078; SSE-NEXT: movdqa %xmm1, %xmm11 5079; SSE-NEXT: pandn %xmm10, %xmm11 5080; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[3,3,3,3] 5081; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] 5082; SSE-NEXT: pand %xmm1, %xmm10 5083; SSE-NEXT: por %xmm11, %xmm10 5084; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] 5085; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] 5086; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] 5087; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5088; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 5089; SSE-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] 5090; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,0,2,1,4,5,6,7] 5091; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 5092; SSE-NEXT: movdqa %xmm13, %xmm10 5093; SSE-NEXT: pandn %xmm6, %xmm10 5094; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] 5095; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] 5096; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 5097; SSE-NEXT: pand %xmm13, %xmm6 5098; SSE-NEXT: por %xmm10, %xmm6 5099; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 5100; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,0,2,1,4,5,6,7] 5101; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] 5102; SSE-NEXT: movdqa %xmm1, %xmm8 5103; SSE-NEXT: pandn %xmm7, %xmm8 5104; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] 5105; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,0,0] 5106; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 5107; SSE-NEXT: pand %xmm1, %xmm7 5108; SSE-NEXT: por %xmm8, %xmm7 5109; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] 5110; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] 5111; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] 5112; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5113; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] 
5114; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 5115; SSE-NEXT: movdqa %xmm13, %xmm7 5116; SSE-NEXT: pandn %xmm6, %xmm7 5117; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,1,3,3,4,5,6,7] 5118; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 5119; SSE-NEXT: pand %xmm13, %xmm6 5120; SSE-NEXT: por %xmm7, %xmm6 5121; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] 5122; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] 5123; SSE-NEXT: movdqa %xmm1, %xmm8 5124; SSE-NEXT: pandn %xmm7, %xmm8 5125; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] 5126; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 5127; SSE-NEXT: pand %xmm1, %xmm7 5128; SSE-NEXT: por %xmm8, %xmm7 5129; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] 5130; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] 5131; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] 5132; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill 5133; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,4,6,5] 5134; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 5135; SSE-NEXT: movdqa %xmm13, %xmm7 5136; SSE-NEXT: pandn %xmm6, %xmm7 5137; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] 5138; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 5139; SSE-NEXT: pand %xmm13, %xmm6 5140; SSE-NEXT: por %xmm7, %xmm6 5141; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,4,6,5] 5142; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] 5143; SSE-NEXT: movdqa %xmm1, %xmm8 5144; SSE-NEXT: pandn %xmm7, %xmm8 5145; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] 5146; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 5147; SSE-NEXT: pand %xmm1, %xmm7 5148; SSE-NEXT: por %xmm8, %xmm7 5149; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] 5150; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] 5151; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] 5152; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5153; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 5154; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 5155; SSE-NEXT: movdqa %xmm13, %xmm6 5156; SSE-NEXT: pandn %xmm0, %xmm6 5157; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,5,7,7] 5158; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 5159; SSE-NEXT: pand %xmm13, %xmm0 5160; SSE-NEXT: por %xmm6, %xmm0 5161; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7] 5162; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 5163; SSE-NEXT: movdqa %xmm1, %xmm4 5164; SSE-NEXT: pandn %xmm3, %xmm4 5165; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] 5166; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] 5167; SSE-NEXT: pand %xmm1, %xmm3 5168; SSE-NEXT: por %xmm4, %xmm3 5169; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] 5170; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 5171; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 5172; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5173; SSE-NEXT: movdqa 48(%r10), %xmm9 5174; SSE-NEXT: movdqa 48(%rax), %xmm0 5175; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5176; SSE-NEXT: movdqa %xmm9, %xmm6 5177; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] 5178; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] 5179; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 5180; SSE-NEXT: movdqa %xmm13, %xmm3 5181; SSE-NEXT: pandn %xmm0, %xmm3 5182; SSE-NEXT: movdqa 48(%r8), %xmm8 5183; SSE-NEXT: movdqa 48(%r9), %xmm0 5184; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5185; SSE-NEXT: movdqa %xmm8, %xmm4 5186; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 5187; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] 5188; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,1] 5189; SSE-NEXT: pand %xmm13, %xmm10 5190; SSE-NEXT: por %xmm3, %xmm10 5191; SSE-NEXT: movdqa 48(%rdx), %xmm7 5192; SSE-NEXT: movdqa 48(%rcx), %xmm12 5193; SSE-NEXT: movdqa %xmm7, %xmm3 5194; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] 5195; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] 5196; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 5197; SSE-NEXT: movdqa %xmm1, %xmm15 5198; SSE-NEXT: pandn %xmm0, %xmm15 5199; SSE-NEXT: movdqa 48(%rdi), %xmm5 5200; SSE-NEXT: movdqa 48(%rsi), %xmm11 5201; SSE-NEXT: movdqa %xmm5, %xmm0 5202; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] 5203; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,0,0] 5204; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] 5205; SSE-NEXT: pand %xmm1, %xmm14 5206; SSE-NEXT: por %xmm15, %xmm14 5207; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,3,2,3] 5208; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] 5209; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] 5210; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5211; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[0,2,2,3,4,5,6,7] 5212; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] 5213; SSE-NEXT: movdqa %xmm13, %xmm14 5214; SSE-NEXT: pandn %xmm10, %xmm14 5215; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[2,1,3,3,4,5,6,7] 5216; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] 5217; SSE-NEXT: pand %xmm13, %xmm10 5218; SSE-NEXT: por %xmm14, %xmm10 5219; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm3[0,2,2,3,4,5,6,7] 5220; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3] 5221; SSE-NEXT: movdqa %xmm1, %xmm15 5222; SSE-NEXT: pandn %xmm14, %xmm15 5223; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,1,1] 5224; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] 5225; SSE-NEXT: pand %xmm1, %xmm14 5226; SSE-NEXT: por %xmm15, %xmm14 5227; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,3,2,3] 5228; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,2,2,3] 5229; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] 5230; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,4,6,5] 5231; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] 5232; SSE-NEXT: movdqa %xmm13, %xmm14 5233; SSE-NEXT: pandn %xmm10, %xmm14 5234; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,5,5,7] 5235; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] 5236; SSE-NEXT: pand %xmm13, %xmm10 5237; SSE-NEXT: por %xmm14, %xmm10 5238; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,4,6,5] 5239; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] 5240; SSE-NEXT: movdqa %xmm1, %xmm2 5241; SSE-NEXT: pandn %xmm14, %xmm2 5242; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[2,2,2,2] 5243; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] 5244; SSE-NEXT: pand %xmm1, %xmm14 5245; SSE-NEXT: por %xmm2, %xmm14 5246; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,3,2,3] 5247; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[0,2,2,3] 5248; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] 5249; 
SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,6,6,7] 5250; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5251; SSE-NEXT: movdqa %xmm13, %xmm6 5252; SSE-NEXT: pandn %xmm2, %xmm6 5253; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,7,7] 5254; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5255; SSE-NEXT: pand %xmm13, %xmm2 5256; SSE-NEXT: por %xmm6, %xmm2 5257; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 5258; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 5259; SSE-NEXT: movdqa %xmm1, %xmm4 5260; SSE-NEXT: pandn %xmm3, %xmm4 5261; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 5262; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 5263; SSE-NEXT: pand %xmm1, %xmm0 5264; SSE-NEXT: por %xmm4, %xmm0 5265; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] 5266; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] 5267; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] 5268; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 5269; SSE-NEXT: # xmm9 = xmm9[8],mem[8],xmm9[9],mem[9],xmm9[10],mem[10],xmm9[11],mem[11],xmm9[12],mem[12],xmm9[13],mem[13],xmm9[14],mem[14],xmm9[15],mem[15] 5270; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] 5271; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 5272; SSE-NEXT: movdqa %xmm13, %xmm2 5273; SSE-NEXT: pandn %xmm0, %xmm2 5274; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 5275; SSE-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] 5276; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,3,4,5,6,7] 5277; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 5278; SSE-NEXT: pand %xmm13, %xmm0 5279; SSE-NEXT: por %xmm2, %xmm0 5280; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] 5281; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,0,2,1,4,5,6,7] 5282; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] 5283; SSE-NEXT: movdqa %xmm1, %xmm3 5284; SSE-NEXT: pandn %xmm2, %xmm3 5285; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] 5286; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,0,0] 5287; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 5288; SSE-NEXT: pand %xmm1, %xmm2 5289; SSE-NEXT: por %xmm3, %xmm2 5290; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] 5291; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] 5292; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] 5293; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] 5294; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 5295; SSE-NEXT: movdqa %xmm13, %xmm2 5296; SSE-NEXT: pandn %xmm0, %xmm2 5297; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7] 5298; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 5299; SSE-NEXT: pand %xmm13, %xmm0 5300; SSE-NEXT: por %xmm2, %xmm0 5301; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7] 5302; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] 5303; SSE-NEXT: movdqa %xmm1, %xmm3 5304; SSE-NEXT: pandn %xmm2, %xmm3 5305; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,1,1] 5306; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 5307; SSE-NEXT: pand %xmm1, %xmm2 5308; SSE-NEXT: por %xmm3, %xmm2 5309; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 5310; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 5311; SSE-NEXT: 
punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 5312; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,6,5] 5313; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5314; SSE-NEXT: movdqa %xmm13, %xmm3 5315; SSE-NEXT: pandn %xmm2, %xmm3 5316; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7] 5317; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5318; SSE-NEXT: pand %xmm13, %xmm2 5319; SSE-NEXT: por %xmm3, %xmm2 5320; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] 5321; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 5322; SSE-NEXT: movdqa %xmm1, %xmm11 5323; SSE-NEXT: pandn %xmm3, %xmm11 5324; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,2,2] 5325; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] 5326; SSE-NEXT: pand %xmm1, %xmm3 5327; SSE-NEXT: por %xmm11, %xmm3 5328; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] 5329; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 5330; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 5331; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] 5332; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5333; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7] 5334; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 5335; SSE-NEXT: pand %xmm13, %xmm8 5336; SSE-NEXT: pandn %xmm2, %xmm13 5337; SSE-NEXT: por %xmm8, %xmm13 5338; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7] 5339; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] 5340; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] 5341; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] 5342; SSE-NEXT: pand %xmm1, %xmm5 5343; SSE-NEXT: pandn %xmm2, %xmm1 5344; SSE-NEXT: por %xmm5, %xmm1 5345; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,3,2,3] 5346; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5347; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 5348; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 5349; SSE-NEXT: movdqa %xmm1, 496(%rax) 5350; SSE-NEXT: movdqa %xmm3, 480(%rax) 5351; SSE-NEXT: movdqa %xmm0, 464(%rax) 5352; SSE-NEXT: movdqa %xmm4, 448(%rax) 5353; SSE-NEXT: movdqa %xmm6, 432(%rax) 5354; SSE-NEXT: movdqa %xmm10, 416(%rax) 5355; SSE-NEXT: movdqa %xmm15, 400(%rax) 5356; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5357; SSE-NEXT: movaps %xmm0, 384(%rax) 5358; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5359; SSE-NEXT: movaps %xmm0, 368(%rax) 5360; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5361; SSE-NEXT: movaps %xmm0, 352(%rax) 5362; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 5363; SSE-NEXT: movaps %xmm0, 336(%rax) 5364; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5365; SSE-NEXT: movaps %xmm0, 320(%rax) 5366; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5367; SSE-NEXT: movaps %xmm0, 304(%rax) 5368; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5369; SSE-NEXT: movaps %xmm0, 288(%rax) 5370; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5371; SSE-NEXT: movaps %xmm0, 272(%rax) 5372; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5373; SSE-NEXT: movaps %xmm0, 256(%rax) 5374; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5375; SSE-NEXT: movaps %xmm0, 240(%rax) 5376; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5377; SSE-NEXT: movaps %xmm0, 224(%rax) 5378; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5379; SSE-NEXT: movaps %xmm0, 208(%rax) 5380; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 
5381; SSE-NEXT: movaps %xmm0, 192(%rax) 5382; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5383; SSE-NEXT: movaps %xmm0, 176(%rax) 5384; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5385; SSE-NEXT: movaps %xmm0, 160(%rax) 5386; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5387; SSE-NEXT: movaps %xmm0, 144(%rax) 5388; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5389; SSE-NEXT: movaps %xmm0, 128(%rax) 5390; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5391; SSE-NEXT: movaps %xmm0, 112(%rax) 5392; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5393; SSE-NEXT: movaps %xmm0, 96(%rax) 5394; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5395; SSE-NEXT: movaps %xmm0, 80(%rax) 5396; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5397; SSE-NEXT: movaps %xmm0, 64(%rax) 5398; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5399; SSE-NEXT: movaps %xmm0, 48(%rax) 5400; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5401; SSE-NEXT: movaps %xmm0, 32(%rax) 5402; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5403; SSE-NEXT: movaps %xmm0, 16(%rax) 5404; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5405; SSE-NEXT: movaps %xmm0, (%rax) 5406; SSE-NEXT: addq $312, %rsp # imm = 0x138 5407; SSE-NEXT: retq 5408; 5409; AVX-LABEL: store_i8_stride8_vf64: 5410; AVX: # %bb.0: 5411; AVX-NEXT: subq $328, %rsp # imm = 0x148 5412; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 5413; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 5414; AVX-NEXT: vmovdqa (%r10), %xmm1 5415; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill 5416; AVX-NEXT: vmovdqa (%rax), %xmm0 5417; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5418; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 5419; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 5420; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] 5421; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 5422; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 5423; AVX-NEXT: vbroadcastsd {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] 5424; AVX-NEXT: vandnps %ymm1, %ymm9, %ymm2 5425; AVX-NEXT: vmovdqa (%r9), %xmm3 5426; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5427; AVX-NEXT: vmovdqa (%r8), %xmm1 5428; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5429; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 5430; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] 5431; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,6,5,7,7] 5432; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 5433; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 5434; AVX-NEXT: vandps %ymm3, %ymm9, %ymm3 5435; AVX-NEXT: vorps %ymm2, %ymm3, %ymm3 5436; AVX-NEXT: vmovdqa (%rcx), %xmm4 5437; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5438; AVX-NEXT: vmovdqa (%rdx), %xmm2 5439; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5440; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 5441; AVX-NEXT: 
vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] 5442; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,6,6,7] 5443; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 5444; AVX-NEXT: vmovdqa (%rsi), %xmm6 5445; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5446; AVX-NEXT: vmovdqa (%rdi), %xmm5 5447; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5448; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] 5449; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] 5450; AVX-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero 5451; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[3,3,3,3] 5452; AVX-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero 5453; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 5454; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] 5455; AVX-NEXT: vbroadcastsd {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] 5456; AVX-NEXT: vandnps %ymm4, %ymm14, %ymm4 5457; AVX-NEXT: vandps %ymm6, %ymm14, %ymm6 5458; AVX-NEXT: vorps %ymm4, %ymm6, %ymm4 5459; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] 5460; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5461; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] 5462; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 5463; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 5464; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,1,3,4,5,6,7] 5465; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] 5466; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 5467; AVX-NEXT: vmovdqa 48(%r10), %xmm1 5468; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5469; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] 5470; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 5471; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] 5472; AVX-NEXT: vandps %ymm3, %ymm9, %ymm3 5473; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0 5474; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero 5475; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] 5476; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 5477; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 5478; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7] 5479; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 5480; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] 5481; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 5482; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm4 5483; AVX-NEXT: vmovdqa 48(%rax), %xmm2 5484; AVX-NEXT: vandps %ymm3, %ymm14, %ymm3 5485; AVX-NEXT: vandnps %ymm4, %ymm14, %ymm4 5486; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 5487; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] 5488; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5489; AVX-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 5490; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] 5491; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] 5492; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 5493; AVX-NEXT: vmovdqa 48(%r9), %xmm12 5494; AVX-NEXT: vmovdqa 48(%r8), %xmm3 5495; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = 
xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] 5496; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,1,3,4,5,6,7] 5497; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[2,1,3,3,4,5,6,7] 5498; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 5499; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] 5500; AVX-NEXT: vandnps %ymm4, %ymm9, %ymm4 5501; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] 5502; AVX-NEXT: vandps %ymm5, %ymm9, %ymm5 5503; AVX-NEXT: vorps %ymm4, %ymm5, %ymm9 5504; AVX-NEXT: vmovdqa 48(%rsi), %xmm4 5505; AVX-NEXT: vmovdqa 48(%rdi), %xmm6 5506; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] 5507; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,1,1] 5508; AVX-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero 5509; AVX-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 5510; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm0 5511; AVX-NEXT: vmovdqa 48(%rcx), %xmm5 5512; AVX-NEXT: vmovdqa 48(%rdx), %xmm7 5513; AVX-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] 5514; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7] 5515; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 5516; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,2,2,3,4,5,6,7] 5517; AVX-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero 5518; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 5519; AVX-NEXT: vandps %ymm0, %ymm14, %ymm13 5520; AVX-NEXT: vandnps %ymm1, %ymm14, %ymm1 5521; AVX-NEXT: vorps %ymm1, %ymm13, %ymm1 5522; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4],ymm9[5],ymm1[6],ymm9[7] 5523; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5524; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] 5525; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,6,6,7] 5526; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 5527; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,4,5,5,7] 5528; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7] 5529; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 5530; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 5531; AVX-NEXT: vbroadcastsd {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] 5532; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm1 5533; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] 5534; AVX-NEXT: vandps %ymm11, %ymm8, %ymm8 5535; AVX-NEXT: vorps %ymm1, %ymm8, %ymm1 5536; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,4,6,5] 5537; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] 5538; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 5539; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] 5540; AVX-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero 5541; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] 5542; AVX-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 5543; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 5544; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] 5545; AVX-NEXT: vandnps %ymm8, %ymm14, %ymm8 5546; AVX-NEXT: vandps %ymm14, %ymm9, %ymm9 5547; AVX-NEXT: vorps %ymm8, %ymm9, %ymm8 5548; AVX-NEXT: vblendps {{.*#+}} ymm1 = 
ymm8[0],ymm1[1],ymm8[2],ymm1[3],ymm8[4],ymm1[5],ymm8[6],ymm1[7] 5549; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5550; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload 5551; AVX-NEXT: # xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] 5552; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] 5553; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7] 5554; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm8 5555; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] 5556; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,1,3,4,5,6,7] 5557; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] 5558; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 5559; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5] 5560; AVX-NEXT: vmovaps %ymm11, %ymm12 5561; AVX-NEXT: vandnps %ymm3, %ymm11, %ymm3 5562; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] 5563; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0 5564; AVX-NEXT: vorps %ymm3, %ymm0, %ymm3 5565; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] 5566; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] 5567; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 5568; AVX-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 5569; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 5570; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] 5571; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] 5572; AVX-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero 5573; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] 5574; AVX-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero 5575; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 5576; AVX-NEXT: vandps %ymm4, %ymm14, %ymm4 5577; AVX-NEXT: vandnps %ymm6, %ymm14, %ymm6 5578; AVX-NEXT: vorps %ymm6, %ymm4, %ymm4 5579; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] 5580; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5581; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] 5582; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 5583; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 5584; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] 5585; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] 5586; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 5587; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 5588; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm1 5589; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 5590; AVX-NEXT: vandps %ymm2, %ymm11, %ymm2 5591; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 5592; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5] 5593; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] 5594; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 5595; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 5596; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 5597; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 5598; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 5599; AVX-NEXT: vinsertf128 $1, 
%xmm0, %ymm3, %ymm0 5600; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] 5601; AVX-NEXT: vandnps %ymm2, %ymm14, %ymm2 5602; AVX-NEXT: vandps %ymm0, %ymm14, %ymm0 5603; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 5604; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 5605; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5606; AVX-NEXT: vmovdqa 32(%r10), %xmm1 5607; AVX-NEXT: vmovdqa 32(%rax), %xmm2 5608; AVX-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 5609; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] 5610; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] 5611; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 5612; AVX-NEXT: vmovdqa 32(%r9), %xmm0 5613; AVX-NEXT: vmovdqa 32(%r8), %xmm3 5614; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 5615; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,1,3,4,5,6,7] 5616; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[2,1,3,3,4,5,6,7] 5617; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 5618; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] 5619; AVX-NEXT: vandnps %ymm4, %ymm12, %ymm4 5620; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] 5621; AVX-NEXT: vandps %ymm5, %ymm12, %ymm5 5622; AVX-NEXT: vorps %ymm4, %ymm5, %ymm9 5623; AVX-NEXT: vmovdqa 32(%rsi), %xmm4 5624; AVX-NEXT: vmovdqa 32(%rdi), %xmm6 5625; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] 5626; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,1,1] 5627; AVX-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero 5628; AVX-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 5629; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm13 5630; AVX-NEXT: vmovdqa 32(%rcx), %xmm5 5631; AVX-NEXT: vmovdqa 32(%rdx), %xmm7 5632; AVX-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] 5633; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,0,2,1,4,5,6,7] 5634; AVX-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero 5635; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,2,2,3,4,5,6,7] 5636; AVX-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero 5637; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12 5638; AVX-NEXT: vbroadcastsd {{.*#+}} ymm15 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] 5639; AVX-NEXT: vandps %ymm15, %ymm13, %ymm13 5640; AVX-NEXT: vandnps %ymm12, %ymm15, %ymm12 5641; AVX-NEXT: vorps %ymm12, %ymm13, %ymm12 5642; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] 5643; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5644; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5] 5645; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,6,7] 5646; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 5647; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,5,5,7] 5648; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7] 5649; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 5650; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] 5651; 
AVX-NEXT: vbroadcastsd {{.*#+}} ymm12 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] 5652; AVX-NEXT: vandnps %ymm9, %ymm12, %ymm9 5653; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] 5654; AVX-NEXT: vandps %ymm12, %ymm8, %ymm8 5655; AVX-NEXT: vorps %ymm9, %ymm8, %ymm8 5656; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm14[0,1,2,3,4,4,6,5] 5657; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,4,6,6,7] 5658; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 5659; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] 5660; AVX-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero 5661; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] 5662; AVX-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 5663; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 5664; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] 5665; AVX-NEXT: vandnps %ymm9, %ymm15, %ymm9 5666; AVX-NEXT: vandps %ymm15, %ymm10, %ymm10 5667; AVX-NEXT: vorps %ymm9, %ymm10, %ymm9 5668; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] 5669; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5670; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 5671; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] 5672; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7] 5673; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm8 5674; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 5675; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,1,3,4,5,6,7] 5676; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] 5677; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 5678; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5] 5679; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3 5680; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] 5681; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 5682; AVX-NEXT: vorps %ymm3, %ymm0, %ymm3 5683; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] 5684; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] 5685; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 5686; AVX-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 5687; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 5688; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] 5689; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] 5690; AVX-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero 5691; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] 5692; AVX-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero 5693; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 5694; AVX-NEXT: vandps %ymm4, %ymm15, %ymm4 5695; AVX-NEXT: vandnps %ymm6, %ymm15, %ymm6 5696; AVX-NEXT: vorps %ymm6, %ymm4, %ymm4 5697; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] 5698; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5699; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] 5700; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 5701; 
AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 5702; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] 5703; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] 5704; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 5705; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 5706; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1 5707; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 5708; AVX-NEXT: vandps %ymm2, %ymm12, %ymm2 5709; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 5710; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5] 5711; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] 5712; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 5713; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 5714; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 5715; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 5716; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 5717; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 5718; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] 5719; AVX-NEXT: vandnps %ymm2, %ymm15, %ymm2 5720; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0 5721; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 5722; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 5723; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5724; AVX-NEXT: vmovdqa 16(%r10), %xmm9 5725; AVX-NEXT: vmovdqa 16(%rax), %xmm6 5726; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] 5727; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] 5728; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] 5729; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 5730; AVX-NEXT: vmovdqa 16(%r9), %xmm8 5731; AVX-NEXT: vmovdqa 16(%r8), %xmm7 5732; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] 5733; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,1,3,4,5,6,7] 5734; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[2,1,3,3,4,5,6,7] 5735; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 5736; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] 5737; AVX-NEXT: vandnps %ymm2, %ymm12, %ymm2 5738; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] 5739; AVX-NEXT: vandps %ymm3, %ymm12, %ymm3 5740; AVX-NEXT: vorps %ymm2, %ymm3, %ymm11 5741; AVX-NEXT: vmovdqa 16(%rsi), %xmm5 5742; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 5743; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 5744; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[1,1,1,1] 5745; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 5746; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 5747; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm12 5748; AVX-NEXT: vmovdqa 16(%rcx), %xmm4 5749; AVX-NEXT: vmovdqa 16(%rdx), %xmm2 5750; AVX-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 5751; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,0,2,1,4,5,6,7] 5752; AVX-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero 5753; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,2,2,3,4,5,6,7] 5754; AVX-NEXT: vpmovzxdq {{.*#+}} xmm14 
= xmm14[0],zero,xmm14[1],zero 5755; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 5756; AVX-NEXT: vbroadcastsd {{.*#+}} ymm15 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] 5757; AVX-NEXT: vandps %ymm15, %ymm12, %ymm12 5758; AVX-NEXT: vandnps %ymm14, %ymm15, %ymm14 5759; AVX-NEXT: vorps %ymm14, %ymm12, %ymm12 5760; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] 5761; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,4,4,6,5] 5762; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 5763; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 5764; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,5,5,7] 5765; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] 5766; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 5767; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 5768; AVX-NEXT: vbroadcastsd {{.*#+}} ymm14 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] 5769; AVX-NEXT: vandnps %ymm1, %ymm14, %ymm1 5770; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 5771; AVX-NEXT: vandps %ymm0, %ymm14, %ymm0 5772; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 5773; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,4,6,5] 5774; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,6,6,7] 5775; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 5776; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[2,3,2,3] 5777; AVX-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 5778; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] 5779; AVX-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 5780; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 5781; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] 5782; AVX-NEXT: vandnps %ymm1, %ymm15, %ymm1 5783; AVX-NEXT: vandps %ymm15, %ymm10, %ymm10 5784; AVX-NEXT: vorps %ymm1, %ymm10, %ymm1 5785; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 5786; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] 5787; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] 5788; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] 5789; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 5790; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] 5791; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,1,3,4,5,6,7] 5792; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,3,3,4,5,6,7] 5793; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 5794; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] 5795; AVX-NEXT: vandnps %ymm0, %ymm14, %ymm0 5796; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] 5797; AVX-NEXT: vandps %ymm1, %ymm14, %ymm1 5798; AVX-NEXT: vorps %ymm0, %ymm1, %ymm1 5799; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 5800; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 5801; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 5802; AVX-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 5803; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 5804; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 5805; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7] 5806; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 5807; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7] 5808; AVX-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero 5809; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 5810; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3 5811; AVX-NEXT: vandnps %ymm4, %ymm15, %ymm4 5812; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 5813; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] 5814; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,4,6,5] 5815; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,6,6,7] 5816; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 5817; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,5,7] 5818; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,6,5,7,7] 5819; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 5820; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 5821; AVX-NEXT: vandnps %ymm3, %ymm14, %ymm3 5822; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] 5823; AVX-NEXT: vandps %ymm4, %ymm14, %ymm4 5824; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3 5825; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] 5826; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 5827; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 5828; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] 5829; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 5830; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 5831; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 5832; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 5833; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] 5834; AVX-NEXT: vandnps %ymm2, %ymm15, %ymm2 5835; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0 5836; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 5837; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] 5838; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5839; AVX-NEXT: vpunpckhbw (%rsp), %xmm2, %xmm3 # 16-byte Folded Reload 5840; AVX-NEXT: # xmm3 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] 5841; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] 5842; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7] 5843; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 5844; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5845; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 5846; AVX-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] 5847; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] 5848; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7] 5849; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 5850; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] 5851; AVX-NEXT: vandnps %ymm2, %ymm14, %ymm2 5852; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] 5853; AVX-NEXT: vandps %ymm5, %ymm14, %ymm5 5854; AVX-NEXT: vorps %ymm2, %ymm5, %ymm5 5855; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5856; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 5857; AVX-NEXT: # xmm2 = 
xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] 5858; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,1,1] 5859; AVX-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero 5860; AVX-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 5861; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 5862; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 5863; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload 5864; AVX-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] 5865; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] 5866; AVX-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero 5867; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] 5868; AVX-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero 5869; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 5870; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6 5871; AVX-NEXT: vandnps %ymm8, %ymm15, %ymm8 5872; AVX-NEXT: vorps %ymm6, %ymm8, %ymm6 5873; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] 5874; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,4,6,5] 5875; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 5876; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 5877; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,5,7] 5878; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] 5879; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 5880; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 5881; AVX-NEXT: vandnps %ymm3, %ymm14, %ymm3 5882; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] 5883; AVX-NEXT: vandps %ymm4, %ymm14, %ymm4 5884; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3 5885; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,4,6,5] 5886; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,6,6,7] 5887; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 5888; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] 5889; AVX-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero 5890; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] 5891; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 5892; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 5893; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] 5894; AVX-NEXT: vandnps %ymm4, %ymm15, %ymm4 5895; AVX-NEXT: vandps %ymm2, %ymm15, %ymm2 5896; AVX-NEXT: vorps %ymm4, %ymm2, %ymm2 5897; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] 5898; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 5899; AVX-NEXT: vmovaps %ymm2, 96(%rax) 5900; AVX-NEXT: vmovaps %ymm5, 64(%rax) 5901; AVX-NEXT: vmovaps %ymm0, 160(%rax) 5902; AVX-NEXT: vmovaps %ymm1, 128(%rax) 5903; AVX-NEXT: vmovaps %ymm10, 224(%rax) 5904; AVX-NEXT: vmovaps %ymm11, 192(%rax) 5905; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5906; AVX-NEXT: vmovaps %ymm0, 288(%rax) 5907; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5908; AVX-NEXT: vmovaps %ymm0, 256(%rax) 5909; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5910; AVX-NEXT: vmovaps %ymm0, 352(%rax) 5911; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5912; AVX-NEXT: vmovaps %ymm0, 320(%rax) 5913; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5914; AVX-NEXT: vmovaps 
%ymm0, 416(%rax) 5915; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5916; AVX-NEXT: vmovaps %ymm0, 384(%rax) 5917; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5918; AVX-NEXT: vmovaps %ymm0, 480(%rax) 5919; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5920; AVX-NEXT: vmovaps %ymm0, 448(%rax) 5921; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5922; AVX-NEXT: vmovaps %ymm0, (%rax) 5923; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5924; AVX-NEXT: vmovaps %ymm0, 32(%rax) 5925; AVX-NEXT: addq $328, %rsp # imm = 0x148 5926; AVX-NEXT: vzeroupper 5927; AVX-NEXT: retq 5928; 5929; AVX2-LABEL: store_i8_stride8_vf64: 5930; AVX2: # %bb.0: 5931; AVX2-NEXT: subq $328, %rsp # imm = 0x148 5932; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 5933; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 5934; AVX2-NEXT: vmovdqa (%rsi), %xmm0 5935; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5936; AVX2-NEXT: vmovdqa (%rdi), %xmm1 5937; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5938; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 5939; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] 5940; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 5941; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] 5942; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 5943; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 5944; AVX2-NEXT: vmovdqa (%rcx), %xmm1 5945; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5946; AVX2-NEXT: vmovdqa (%rdx), %xmm3 5947; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5948; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 5949; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5] 5950; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] 5951; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 5952; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] 5953; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] 5954; AVX2-NEXT: vmovdqa (%r10), %xmm0 5955; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5956; AVX2-NEXT: vmovdqa (%rax), %xmm1 5957; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5958; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 5959; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,4,6,5] 5960; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,6,6,7] 5961; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5962; AVX2-NEXT: vmovdqa (%r9), %xmm0 5963; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5964; AVX2-NEXT: vmovdqa (%r8), %xmm6 5965; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5966; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] 5967; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] 5968; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,5,7,7] 5969; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 5970; 
AVX2-NEXT: vmovdqa 48(%rsi), %xmm0 5971; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5],ymm7[6],ymm1[7],ymm7[8,9,10,11,12],ymm1[13],ymm7[14],ymm1[15] 5972; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 5973; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] 5974; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] 5975; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5976; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 5977; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 5978; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 5979; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm5 5980; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] 5981; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 5982; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] 5983; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 5984; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm3 5985; AVX2-NEXT: vmovdqa 48(%rcx), %xmm2 5986; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] 5987; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] 5988; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 5989; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 5990; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7] 5991; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] 5992; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 5993; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15] 5994; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] 5995; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] 5996; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5997; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 5998; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero 5999; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,1,1] 6000; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 6001; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4 6002; AVX2-NEXT: vmovdqa 48(%rdx), %xmm3 6003; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 6004; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7] 6005; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero 6006; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,2,2,3,4,5,6,7] 6007; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero 6008; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 6009; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] 6010; AVX2-NEXT: vmovdqa 48(%r10), %xmm4 6011; AVX2-NEXT: vmovdqa 48(%rax), %xmm5 6012; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 6013; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] 6014; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7] 6015; AVX2-NEXT: vinserti128 $1, %xmm7, 
%ymm6, %ymm12 6016; AVX2-NEXT: vmovdqa 48(%r9), %xmm6 6017; AVX2-NEXT: vmovdqa 48(%r8), %xmm7 6018; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] 6019; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,1,1,3,4,5,6,7] 6020; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[2,1,3,3,4,5,6,7] 6021; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm15, %ymm13 6022; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5,6,7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13,14,15] 6023; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] 6024; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] 6025; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6026; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] 6027; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 6028; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] 6029; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero 6030; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 6031; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,4,6,5] 6032; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] 6033; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 6034; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] 6035; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7,8],ymm9[9],ymm8[10,11,12],ymm9[13],ymm8[14,15] 6036; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5] 6037; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7] 6038; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 6039; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm14[0,1,2,3,4,5,5,7] 6040; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,6,5,7,7] 6041; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 6042; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6],ymm9[7],ymm10[8,9,10,11,12],ymm9[13],ymm10[14],ymm9[15] 6043; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] 6044; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] 6045; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6046; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 6047; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 6048; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 6049; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6050; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 6051; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 6052; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] 6053; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 6054; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] 6055; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero 6056; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 6057; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] 6058; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 6059; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] 6060; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] 6061; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 6062; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 6063; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] 6064; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7] 6065; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 6066; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5,6,7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13,14,15] 6067; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] 6068; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] 6069; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6070; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6071; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 6072; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 6073; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6074; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 6075; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] 6076; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 6077; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 6078; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] 6079; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] 6080; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] 6081; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] 6082; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 6083; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,7] 6084; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7] 6085; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 6086; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6],ymm0[7],ymm2[8,9,10,11,12],ymm0[13],ymm2[14],ymm0[15] 6087; AVX2-NEXT: vmovdqa 32(%rsi), %xmm0 6088; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 6089; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] 6090; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6091; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1 6092; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 6093; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] 6094; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 6095; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero 6096; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm4 6097; AVX2-NEXT: vmovdqa 32(%rcx), %xmm2 6098; AVX2-NEXT: vmovdqa 32(%rdx), %xmm3 6099; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 6100; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7] 6101; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero 6102; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,2,2,3,4,5,6,7] 6103; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = 
xmm6[0],zero,xmm6[1],zero 6104; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 6105; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] 6106; AVX2-NEXT: vmovdqa 32(%r10), %xmm4 6107; AVX2-NEXT: vmovdqa 32(%rax), %xmm5 6108; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 6109; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] 6110; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7] 6111; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm12 6112; AVX2-NEXT: vmovdqa 32(%r9), %xmm6 6113; AVX2-NEXT: vmovdqa 32(%r8), %xmm7 6114; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] 6115; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,1,3,4,5,6,7] 6116; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[2,1,3,3,4,5,6,7] 6117; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 6118; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5,6,7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13,14,15] 6119; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] 6120; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] 6121; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6122; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] 6123; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 6124; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] 6125; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero 6126; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 6127; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,4,6,5] 6128; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] 6129; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 6130; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] 6131; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7,8],ymm9[9],ymm8[10,11,12],ymm9[13],ymm8[14,15] 6132; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5] 6133; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7] 6134; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 6135; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,5,5,7] 6136; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,6,5,7,7] 6137; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 6138; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6],ymm9[7],ymm10[8,9,10,11,12],ymm9[13],ymm10[14],ymm9[15] 6139; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] 6140; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] 6141; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6142; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 6143; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 6144; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 6145; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6146; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 6147; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 6148; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] 6149; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 6150; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] 6151; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero 6152; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 6153; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] 6154; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 6155; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] 6156; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] 6157; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 6158; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 6159; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] 6160; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7] 6161; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 6162; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5,6,7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13,14,15] 6163; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] 6164; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] 6165; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6166; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6167; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 6168; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 6169; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6170; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 6171; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] 6172; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 6173; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 6174; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] 6175; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] 6176; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5] 6177; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] 6178; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 6179; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,7] 6180; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7] 6181; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 6182; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6],ymm1[7],ymm2[8,9,10,11,12],ymm1[13],ymm2[14],ymm1[15] 6183; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 6184; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 6185; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 6186; AVX2-NEXT: vmovdqa 16(%rsi), %xmm14 6187; AVX2-NEXT: vmovdqa 16(%rdi), %xmm12 6188; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] 6189; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 6190; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6191; AVX2-NEXT: vpmovzxwq 
{{.*#+}} xmm1 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero 6192; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 6193; AVX2-NEXT: vmovdqa 16(%rcx), %xmm11 6194; AVX2-NEXT: vmovdqa 16(%rdx), %xmm9 6195; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] 6196; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] 6197; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 6198; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] 6199; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 6200; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 6201; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] 6202; AVX2-NEXT: vmovdqa 16(%r10), %xmm7 6203; AVX2-NEXT: vmovdqa 16(%rax), %xmm6 6204; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] 6205; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7] 6206; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,2,2,3,4,5,6,7] 6207; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 6208; AVX2-NEXT: vmovdqa 16(%r9), %xmm5 6209; AVX2-NEXT: vmovdqa 16(%r8), %xmm4 6210; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] 6211; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] 6212; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,3,3,4,5,6,7] 6213; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm2, %ymm2 6214; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5,6,7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13,14,15] 6215; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] 6216; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2],ymm2[3],ymm10[4],ymm2[5],ymm10[6],ymm2[7] 6217; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] 6218; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 6219; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,3,3,3] 6220; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 6221; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 6222; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] 6223; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 6224; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 6225; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] 6226; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] 6227; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,4,6,5] 6228; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,6,6,7] 6229; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 6230; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7] 6231; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] 6232; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 6233; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7],ymm0[8,9,10,11,12],ymm2[13],ymm0[14],ymm2[15] 6234; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 6235; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 6236; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] 6237; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 6238; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6239; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 6240; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 6241; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] 6242; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] 6243; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 6244; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] 6245; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero 6246; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 6247; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] 6248; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 6249; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] 6250; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] 6251; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 6252; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 6253; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] 6254; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,1,3,3,4,5,6,7] 6255; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 6256; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5,6,7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13,14,15] 6257; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] 6258; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7] 6259; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] 6260; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero 6261; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 6262; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 6263; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 6264; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] 6265; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 6266; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 6267; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] 6268; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] 6269; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] 6270; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 6271; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 6272; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] 6273; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] 6274; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 6275; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6],ymm2[7],ymm3[8,9,10,11,12],ymm2[13],ymm3[14],ymm2[15] 6276; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 6277; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] 6278; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6279; AVX2-NEXT: vpunpckhbw 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 6280; AVX2-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] 6281; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] 6282; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 6283; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 6284; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 6285; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6286; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 6287; AVX2-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] 6288; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] 6289; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero 6290; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] 6291; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero 6292; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 6293; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] 6294; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6295; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload 6296; AVX2-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] 6297; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] 6298; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] 6299; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 6300; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 6301; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload 6302; AVX2-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] 6303; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,1,3,4,5,6,7] 6304; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[2,1,3,3,4,5,6,7] 6305; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm9, %ymm9 6306; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5,6,7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13,14,15] 6307; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] 6308; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7] 6309; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] 6310; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero 6311; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] 6312; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 6313; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 6314; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5] 6315; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] 6316; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 6317; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] 6318; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] 6319; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,4,6,5] 6320; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] 6321; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 6322; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = 
xmm7[0,1,2,3,4,5,5,7] 6323; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,7,7] 6324; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 6325; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6],ymm4[7],ymm5[8,9,10,11,12],ymm4[13],ymm5[14],ymm4[15] 6326; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] 6327; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] 6328; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 6329; AVX2-NEXT: vmovdqa %ymm2, 96(%rax) 6330; AVX2-NEXT: vmovdqa %ymm3, 64(%rax) 6331; AVX2-NEXT: vmovdqa %ymm1, 160(%rax) 6332; AVX2-NEXT: vmovdqa %ymm0, 128(%rax) 6333; AVX2-NEXT: vmovdqa %ymm8, 224(%rax) 6334; AVX2-NEXT: vmovdqa %ymm10, 192(%rax) 6335; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 6336; AVX2-NEXT: vmovaps %ymm0, 288(%rax) 6337; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6338; AVX2-NEXT: vmovaps %ymm0, 256(%rax) 6339; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6340; AVX2-NEXT: vmovaps %ymm0, 352(%rax) 6341; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6342; AVX2-NEXT: vmovaps %ymm0, 320(%rax) 6343; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6344; AVX2-NEXT: vmovaps %ymm0, 416(%rax) 6345; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6346; AVX2-NEXT: vmovaps %ymm0, 384(%rax) 6347; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6348; AVX2-NEXT: vmovaps %ymm0, 480(%rax) 6349; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6350; AVX2-NEXT: vmovaps %ymm0, 448(%rax) 6351; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6352; AVX2-NEXT: vmovaps %ymm0, (%rax) 6353; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6354; AVX2-NEXT: vmovaps %ymm0, 32(%rax) 6355; AVX2-NEXT: addq $328, %rsp # imm = 0x148 6356; AVX2-NEXT: vzeroupper 6357; AVX2-NEXT: retq 6358; 6359; AVX2-FP-LABEL: store_i8_stride8_vf64: 6360; AVX2-FP: # %bb.0: 6361; AVX2-FP-NEXT: subq $392, %rsp # imm = 0x188 6362; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6363; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 6364; AVX2-FP-NEXT: vmovdqa (%r10), %xmm1 6365; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6366; AVX2-FP-NEXT: vmovdqa (%rax), %xmm0 6367; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6368; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 6369; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6370; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] 6371; AVX2-FP-NEXT: vmovdqa (%r9), %xmm3 6372; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6373; AVX2-FP-NEXT: vmovdqa (%r8), %xmm1 6374; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6375; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 6376; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 6377; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] 6378; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] 6379; AVX2-FP-NEXT: vmovdqa 
(%rcx), %xmm4 6380; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6381; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 6382; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6383; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 6384; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 6385; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm5 6386; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6387; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm4 6388; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6389; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 6390; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 6391; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] 6392; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] 6393; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] 6394; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] 6395; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6396; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 6397; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 6398; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 6399; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 6400; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] 6401; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm13 = [1284,1798] 6402; AVX2-FP-NEXT: vpshufb %xmm13, %xmm4, %xmm1 6403; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 6404; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 6405; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 6406; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 6407; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] 6408; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 6409; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6410; AVX2-FP-NEXT: vmovdqa 48(%r10), %xmm8 6411; AVX2-FP-NEXT: vmovdqa 48(%rax), %xmm3 6412; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] 6413; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 6414; AVX2-FP-NEXT: vmovdqa 48(%r9), %xmm4 6415; AVX2-FP-NEXT: vmovdqa 48(%r8), %xmm5 6416; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 6417; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 6418; AVX2-FP-NEXT: vpshufb %ymm9, %ymm6, %ymm0 6419; AVX2-FP-NEXT: vpshufb %ymm10, %ymm7, %ymm2 6420; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] 6421; AVX2-FP-NEXT: vmovdqa 48(%rsi), %xmm10 6422; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm9 6423; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] 6424; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm1 6425; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6426; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 6427; AVX2-FP-NEXT: vmovdqa 48(%rcx), %xmm13 6428; AVX2-FP-NEXT: vmovdqa 48(%rdx), %xmm2 6429; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] 6430; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 6431; AVX2-FP-NEXT: vpshufb %ymm12, %ymm15, %ymm14 6432; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] 6433; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4],ymm11[5],ymm1[6],ymm11[7] 6434; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6435; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] 6436; AVX2-FP-NEXT: vpshufb %ymm12, %ymm6, %ymm1 6437; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] 6438; AVX2-FP-NEXT: vpshufb %ymm11, %ymm7, %ymm6 6439; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] 6440; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] 6441; AVX2-FP-NEXT: vpshufb %ymm7, %ymm15, %ymm6 6442; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6443; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] 6444; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm0 6445; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] 6446; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 6447; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6448; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] 6449; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 6450; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6451; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 6452; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 6453; AVX2-FP-NEXT: vpshufb %ymm15, %ymm0, %ymm3 6454; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 6455; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm4 6456; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] 6457; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] 
6458; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] 6459; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm10 = [1284,1798] 6460; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm5 6461; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 6462; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 6463; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 6464; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] 6465; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] 6466; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] 6467; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6468; AVX2-FP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 6469; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 6470; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] 6471; AVX2-FP-NEXT: vpshufb %ymm7, %ymm2, %ymm1 6472; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 6473; AVX2-FP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 6474; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] 6475; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 6476; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6477; AVX2-FP-NEXT: vmovdqa 32(%r10), %xmm1 6478; AVX2-FP-NEXT: vmovdqa 32(%rax), %xmm3 6479; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 6480; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 6481; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm4 6482; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm5 6483; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 6484; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 6485; AVX2-FP-NEXT: vpshufb %ymm15, %ymm6, %ymm0 6486; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm2 6487; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] 6488; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm0 6489; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 6490; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 6491; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm10 6492; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero 6493; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 6494; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm11 6495; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm13 6496; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] 6497; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 6498; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] 6499; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7,8],ymm15[9],ymm10[10,11,12],ymm15[13],ymm10[14,15] 6500; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] 6501; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6502; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm10 6503; AVX2-FP-NEXT: vpshufb %ymm12, %ymm6, %ymm6 6504; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] 6505; AVX2-FP-NEXT: vpshufb %ymm15, %ymm7, %ymm7 6506; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] 6507; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] 6508; AVX2-FP-NEXT: vpshufb %ymm12, %ymm14, %ymm7 6509; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 6510; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] 6511; AVX2-FP-NEXT: vpshufb %ymm14, %ymm8, %ymm8 6512; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] 6513; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] 6514; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6515; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 6516; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 6517; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 6518; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 6519; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 6520; AVX2-FP-NEXT: vpshufb %ymm9, %ymm1, %ymm4 6521; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 6522; AVX2-FP-NEXT: vpshufb %ymm7, %ymm3, %ymm5 6523; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] 6524; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] 6525; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 6526; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [1284,1798] 6527; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm2 6528; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6529; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 6530; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 6531; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 6532; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm6 6533; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15] 6534; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] 6535; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6536; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 6537; AVX2-FP-NEXT: vpshufb %ymm15, %ymm3, %ymm2 6538; 
AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] 6539; AVX2-FP-NEXT: vpshufb %ymm12, %ymm5, %ymm2 6540; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6541; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm0 6542; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] 6543; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 6544; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6545; AVX2-FP-NEXT: vmovdqa 16(%r10), %xmm4 6546; AVX2-FP-NEXT: vmovdqa 16(%rax), %xmm2 6547; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 6548; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 6549; AVX2-FP-NEXT: vmovdqa 16(%r9), %xmm1 6550; AVX2-FP-NEXT: vmovdqa 16(%r8), %xmm0 6551; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 6552; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm8 6553; AVX2-FP-NEXT: vpshufb %ymm9, %ymm6, %ymm3 6554; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm5 6555; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] 6556; AVX2-FP-NEXT: vmovdqa 16(%rsi), %xmm5 6557; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3 6558; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 6559; AVX2-FP-NEXT: vpshufb %xmm11, %xmm9, %xmm10 6560; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero 6561; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 6562; AVX2-FP-NEXT: vmovdqa 16(%rcx), %xmm13 6563; AVX2-FP-NEXT: vmovdqa 16(%rdx), %xmm14 6564; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] 6565; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 6566; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 6567; AVX2-FP-NEXT: vpshufb %ymm11, %ymm15, %ymm11 6568; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] 6569; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7] 6570; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6571; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] 6572; AVX2-FP-NEXT: vpshufb %ymm12, %ymm6, %ymm6 6573; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] 6574; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm8 6575; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15] 6576; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] 6577; AVX2-FP-NEXT: vpshufb %ymm11, %ymm15, %ymm8 6578; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 6579; 
AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm10 = [2312,2826,3340,3854] 6580; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 6581; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] 6582; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] 6583; AVX2-FP-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill 6584; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 6585; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 6586; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm1 6587; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 6588; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 6589; AVX2-FP-NEXT: vpshufb %ymm6, %ymm1, %ymm0 6590; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 6591; AVX2-FP-NEXT: vpshufb %ymm9, %ymm2, %ymm4 6592; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] 6593; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 6594; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 6595; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm14 = [1284,1798] 6596; AVX2-FP-NEXT: vpshufb %xmm14, %xmm3, %xmm5 6597; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 6598; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 6599; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 6600; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 6601; AVX2-FP-NEXT: vpshufb %ymm13, %ymm4, %ymm8 6602; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] 6603; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] 6604; AVX2-FP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 6605; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm15 6606; AVX2-FP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 6607; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] 6608; AVX2-FP-NEXT: vpshufb %ymm11, %ymm4, %ymm2 6609; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm7 6610; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 6611; AVX2-FP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 6612; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm11 6613; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] 6614; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] 6615; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6616; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 6617; AVX2-FP-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] 6618; AVX2-FP-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6619; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 6620; AVX2-FP-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] 6621; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 6622; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm4 6623; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 6624; AVX2-FP-NEXT: vpshufb %ymm9, %ymm3, %ymm5 6625; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] 6626; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6627; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload 6628; AVX2-FP-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] 6629; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 6630; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 6631; AVX2-FP-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] 6632; AVX2-FP-NEXT: vpshufb %xmm14, %xmm8, %xmm9 6633; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero 6634; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 6635; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 6636; AVX2-FP-NEXT: vpshufb %ymm13, %ymm5, %ymm10 6637; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] 6638; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] 6639; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 6640; AVX2-FP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 6641; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] 6642; AVX2-FP-NEXT: vpshufb %ymm7, %ymm5, %ymm3 6643; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm5 6644; AVX2-FP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 6645; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] 6646; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] 6647; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6648; AVX2-FP-NEXT: vmovdqa %ymm2, 96(%rax) 6649; AVX2-FP-NEXT: vmovdqa %ymm4, 64(%rax) 6650; AVX2-FP-NEXT: vmovdqa %ymm1, 160(%rax) 6651; AVX2-FP-NEXT: vmovdqa %ymm0, 128(%rax) 6652; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 6653; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) 6654; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6655; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rax) 6656; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6657; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rax) 6658; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6659; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rax) 6660; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6661; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rax) 6662; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6663; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rax) 6664; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6665; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rax) 
6666; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6667; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rax) 6668; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6669; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rax) 6670; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6671; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rax) 6672; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6673; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) 6674; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6675; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) 6676; AVX2-FP-NEXT: addq $392, %rsp # imm = 0x188 6677; AVX2-FP-NEXT: vzeroupper 6678; AVX2-FP-NEXT: retq 6679; 6680; AVX2-FCP-LABEL: store_i8_stride8_vf64: 6681; AVX2-FCP: # %bb.0: 6682; AVX2-FCP-NEXT: subq $392, %rsp # imm = 0x188 6683; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6684; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 6685; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm1 6686; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6687; AVX2-FCP-NEXT: vmovdqa (%rax), %xmm0 6688; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6689; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 6690; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6691; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] 6692; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm3 6693; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6694; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm1 6695; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6696; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 6697; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 6698; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] 6699; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] 6700; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm4 6701; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6702; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 6703; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6704; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 6705; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 6706; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm5 6707; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6708; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm4 6709; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6710; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 6711; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 6712; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] 6713; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] 6714; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] 6715; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] 6716; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6717; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 6718; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 6719; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 6720; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 6721; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] 6722; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm13 = [1284,1798] 6723; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm1 6724; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 6725; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 6726; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 6727; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 6728; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] 6729; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 6730; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6731; AVX2-FCP-NEXT: vmovdqa 48(%r10), %xmm8 6732; AVX2-FCP-NEXT: vmovdqa 48(%rax), %xmm3 6733; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] 6734; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 6735; AVX2-FCP-NEXT: vmovdqa 48(%r9), %xmm4 6736; AVX2-FCP-NEXT: vmovdqa 48(%r8), %xmm5 6737; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 6738; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 6739; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm0 6740; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm2 6741; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] 6742; AVX2-FCP-NEXT: vmovdqa 48(%rsi), %xmm10 6743; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm9 6744; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] 6745; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm1 6746; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6747; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 6748; AVX2-FCP-NEXT: vmovdqa 48(%rcx), %xmm13 6749; AVX2-FCP-NEXT: vmovdqa 48(%rdx), %xmm2 6750; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] 6751; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 6752; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm14 6753; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] 6754; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4],ymm11[5],ymm1[6],ymm11[7] 6755; 
AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6756; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] 6757; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm1 6758; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] 6759; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm6 6760; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] 6761; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] 6762; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm6 6763; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6764; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] 6765; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm0 6766; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] 6767; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 6768; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6769; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] 6770; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 6771; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6772; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 6773; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 6774; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm3 6775; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 6776; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm4 6777; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] 6778; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] 6779; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] 6780; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm10 = [1284,1798] 6781; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm5 6782; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 6783; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 6784; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 6785; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] 6786; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] 6787; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] 6788; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6789; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 6790; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 6791; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] 6792; 
AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1 6793; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 6794; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 6795; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] 6796; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 6797; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6798; AVX2-FCP-NEXT: vmovdqa 32(%r10), %xmm1 6799; AVX2-FCP-NEXT: vmovdqa 32(%rax), %xmm3 6800; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 6801; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 6802; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm4 6803; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm5 6804; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 6805; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 6806; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm0 6807; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm2 6808; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] 6809; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 6810; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 6811; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 6812; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm10 6813; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero 6814; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 6815; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm11 6816; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm13 6817; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] 6818; AVX2-FCP-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 6819; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] 6820; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7,8],ymm15[9],ymm10[10,11,12],ymm15[13],ymm10[14,15] 6821; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] 6822; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6823; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm10 6824; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6 6825; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] 6826; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm7 6827; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] 6828; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] 6829; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm7 6830; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 6831; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] 6832; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm8 6833; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] 6834; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] 6835; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6836; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 6837; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 6838; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 6839; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 6840; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 6841; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm4 6842; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 6843; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm5 6844; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] 6845; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] 6846; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 6847; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [1284,1798] 6848; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm2 6849; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6850; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 6851; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 6852; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 6853; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm6 6854; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15] 6855; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] 6856; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6857; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 6858; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm2 6859; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] 6860; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm2 6861; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6862; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm0 6863; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] 6864; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 6865; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6866; AVX2-FCP-NEXT: vmovdqa 16(%r10), %xmm4 6867; AVX2-FCP-NEXT: vmovdqa 16(%rax), %xmm2 6868; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 6869; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 6870; AVX2-FCP-NEXT: vmovdqa 16(%r9), %xmm1 6871; AVX2-FCP-NEXT: vmovdqa 16(%r8), %xmm0 6872; AVX2-FCP-NEXT: vpunpckhbw 
{{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 6873; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm8 6874; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm3 6875; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm5 6876; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] 6877; AVX2-FCP-NEXT: vmovdqa 16(%rsi), %xmm5 6878; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 6879; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 6880; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm10 6881; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero 6882; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 6883; AVX2-FCP-NEXT: vmovdqa 16(%rcx), %xmm13 6884; AVX2-FCP-NEXT: vmovdqa 16(%rdx), %xmm14 6885; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] 6886; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 6887; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 6888; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm11 6889; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] 6890; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7] 6891; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6892; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] 6893; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6 6894; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] 6895; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm8 6896; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15] 6897; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] 6898; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm8 6899; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 6900; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm10 = [2312,2826,3340,3854] 6901; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 6902; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] 6903; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] 6904; AVX2-FCP-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill 6905; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 6906; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 6907; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm1 6908; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 6909; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 6910; AVX2-FCP-NEXT: vpshufb 
%ymm6, %ymm1, %ymm0 6911; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 6912; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm4 6913; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] 6914; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 6915; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 6916; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm14 = [1284,1798] 6917; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm5 6918; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 6919; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 6920; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 6921; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 6922; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm8 6923; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] 6924; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] 6925; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 6926; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm15 6927; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 6928; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] 6929; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm2 6930; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm7 6931; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 6932; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 6933; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm11 6934; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] 6935; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] 6936; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6937; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 6938; AVX2-FCP-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] 6939; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6940; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 6941; AVX2-FCP-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] 6942; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 6943; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm4 6944; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 6945; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm5 6946; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] 6947; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6948; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload 6949; AVX2-FCP-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] 6950; AVX2-FCP-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 6951; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 6952; AVX2-FCP-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] 6953; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm9 6954; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero 6955; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 6956; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 6957; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm10 6958; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] 6959; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] 6960; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 6961; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 6962; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] 6963; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm3 6964; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm5 6965; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 6966; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] 6967; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] 6968; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6969; AVX2-FCP-NEXT: vmovdqa %ymm2, 96(%rax) 6970; AVX2-FCP-NEXT: vmovdqa %ymm4, 64(%rax) 6971; AVX2-FCP-NEXT: vmovdqa %ymm1, 160(%rax) 6972; AVX2-FCP-NEXT: vmovdqa %ymm0, 128(%rax) 6973; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 6974; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax) 6975; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6976; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax) 6977; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6978; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rax) 6979; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6980; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rax) 6981; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6982; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rax) 6983; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6984; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rax) 6985; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6986; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rax) 6987; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6988; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rax) 6989; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6990; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rax) 6991; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6992; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rax) 6993; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6994; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) 6995; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6996; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) 6997; AVX2-FCP-NEXT: addq $392, %rsp # imm = 0x188 6998; AVX2-FCP-NEXT: vzeroupper 6999; AVX2-FCP-NEXT: retq 7000; 7001; AVX512-LABEL: store_i8_stride8_vf64: 7002; AVX512: # %bb.0: 7003; AVX512-NEXT: subq $680, %rsp # imm = 0x2A8 7004; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 7005; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 7006; AVX512-NEXT: 
vmovdqa (%rcx), %xmm1 7007; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7008; AVX512-NEXT: vmovdqa 32(%rcx), %xmm10 7009; AVX512-NEXT: vmovdqa 48(%rcx), %xmm2 7010; AVX512-NEXT: vmovdqa (%rdx), %xmm0 7011; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7012; AVX512-NEXT: vmovdqa 48(%rdx), %xmm3 7013; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 7014; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7015; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 7016; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 7017; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7018; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7019; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7020; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7021; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7022; AVX512-NEXT: vmovdqa (%r10), %xmm1 7023; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7024; AVX512-NEXT: vmovdqa 48(%r10), %xmm4 7025; AVX512-NEXT: vmovdqa (%rax), %xmm0 7026; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7027; AVX512-NEXT: vmovdqa 48(%rax), %xmm5 7028; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 7029; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7030; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] 7031; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1 7032; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7033; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7034; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7035; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7036; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7037; AVX512-NEXT: vmovdqa (%r9), %xmm1 7038; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7039; AVX512-NEXT: vmovdqa 48(%r9), %xmm7 7040; AVX512-NEXT: vmovdqa (%r8), %xmm0 7041; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7042; AVX512-NEXT: vmovdqa 48(%r8), %xmm12 7043; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 7044; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] 7045; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] 7046; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 7047; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7048; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] 7049; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] 7050; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7051; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7052; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 7053; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7054; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] 7055; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 7056; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7057; 
AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7058; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7059; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm9 7060; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 7061; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7062; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] 7063; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1 7064; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7065; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7066; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7067; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7068; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7069; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] 7070; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] 7071; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,3,3,4,5,6,7] 7072; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 7073; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7074; AVX512-NEXT: vmovdqa 32(%rdx), %xmm1 7075; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,5,5,7] 7076; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] 7077; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0 7078; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 7079; AVX512-NEXT: vmovdqa 32(%r10), %xmm0 7080; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 7081; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] 7082; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,2,2,3,4,5,6,7] 7083; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm11 7084; AVX512-NEXT: vmovdqa 32(%rax), %xmm2 7085; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,4,6,5] 7086; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 7087; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm13, %ymm31 7088; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 7089; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] 7090; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] 7091; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm28 7092; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] 7093; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 7094; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm23 7095; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] 7096; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] 7097; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] 7098; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm21 7099; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] 7100; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] 7101; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm20 7102; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] 7103; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] 7104; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] 7105; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm30 7106; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] 7107; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 7108; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm29 7109; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 7110; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] 7111; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] 7112; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm24 7113; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] 7114; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 7115; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm22 7116; AVX512-NEXT: vmovdqa 32(%r9), %xmm3 7117; AVX512-NEXT: vmovdqa 32(%r8), %xmm5 7118; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 7119; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,1,1,3,4,5,6,7] 7120; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[2,1,3,3,4,5,6,7] 7121; AVX512-NEXT: vinserti32x4 $1, %xmm13, %ymm12, %ymm19 7122; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,5,7] 7123; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] 7124; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm18 7125; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] 7126; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7] 7127; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7] 7128; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm4 7129; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7130; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,4,6,5] 7131; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 7132; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 7133; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7134; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 7135; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7136; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] 7137; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 7138; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7139; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7140; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7141; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7142; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7143; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] 7144; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] 7145; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] 7146; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 7147; AVX512-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7148; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] 7149; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] 7150; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7151; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7152; AVX512-NEXT: vmovdqa 16(%rcx), %xmm14 7153; AVX512-NEXT: vmovdqa 16(%rdx), %xmm12 7154; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] 7155; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7156; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] 7157; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 7158; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7159; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7160; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7161; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7162; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7163; AVX512-NEXT: vmovdqa 16(%r10), %xmm1 7164; AVX512-NEXT: vmovdqa 16(%rax), %xmm2 7165; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 7166; AVX512-NEXT: vmovdqa64 %xmm2, %xmm26 7167; AVX512-NEXT: vmovdqa64 %xmm1, %xmm17 7168; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7169; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] 7170; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 7171; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7172; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7173; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7174; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7175; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7176; AVX512-NEXT: vmovdqa 16(%r9), %xmm0 7177; AVX512-NEXT: vmovdqa 16(%r8), %xmm15 7178; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] 7179; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16 7180; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7] 7181; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,1,3,3,4,5,6,7] 7182; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm0 7183; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7184; AVX512-NEXT: vmovdqa (%rsi), %xmm1 7185; AVX512-NEXT: vmovdqa (%rdi), %xmm2 7186; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 7187; AVX512-NEXT: vmovdqa64 %xmm2, %xmm25 7188; AVX512-NEXT: vmovdqa64 %xmm1, %xmm27 7189; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] 7190; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] 7191; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7192; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] 7193; AVX512-NEXT: vmovdqa 48(%rsi), %xmm2 7194; AVX512-NEXT: vmovdqa 48(%rdi), %xmm1 7195; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 7196; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 7197; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 7198; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] 7199; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7200; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 7201; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero 7202; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 7203; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 7204; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 7205; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 7206; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 7207; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 7208; AVX512-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] 7209; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 7210; AVX512-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] 7211; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm10 7212; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] 7213; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm13 & (zmm10 ^ zmm3)) 7214; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 7215; AVX512-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] 7216; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 7217; AVX512-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] 7218; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm4 7219; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 7220; AVX512-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] 7221; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 7222; AVX512-NEXT: # ymm5 = mem[0,2,2,3,4,6,6,7] 7223; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 7224; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] 7225; AVX512-NEXT: vpandnq %zmm4, %zmm3, %zmm4 7226; AVX512-NEXT: vpandq %zmm3, %zmm5, %zmm5 7227; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA 7228; AVX512-NEXT: kmovw %eax, %k1 7229; AVX512-NEXT: vpord %zmm4, %zmm5, %zmm10 {%k1} 7230; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 7231; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero 7232; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 7233; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7234; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 7235; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 7236; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 7237; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] 7238; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7] 7239; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 7240; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm13 & (zmm9 ^ zmm0)) 7241; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 7242; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] 7243; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 
32-byte Folded Reload 7244; AVX512-NEXT: # ymm2 = mem[0,2,2,3,4,6,6,7] 7245; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 7246; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 7247; AVX512-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] 7248; AVX512-NEXT: vpshufd $232, (%rsp), %ymm4 # 32-byte Folded Reload 7249; AVX512-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] 7250; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 7251; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0 7252; AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2 7253; AVX512-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1} 7254; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 7255; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7256; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] 7257; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 7258; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 7259; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 7260; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] 7261; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 7262; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 7263; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 7264; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] 7265; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] 7266; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11 7267; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm13 & (zmm11 ^ zmm0)) 7268; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,0,2,1,4,4,6,5] 7269; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] 7270; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 7271; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[0,0,2,1,4,4,6,5] 7272; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,2,2,3,4,6,6,7] 7273; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 7274; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0 7275; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 7276; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm11 {%k1} 7277; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] 7278; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,7,7] 7279; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 7280; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7281; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] 7282; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] 7283; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] 7284; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm28 7285; AVX512-NEXT: vmovdqa 32(%rsi), %xmm5 7286; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 7287; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 7288; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] 7289; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 7290; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] 7291; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero 7292; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 7293; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7294; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 7295; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7296; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 7297; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 7298; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] 7299; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[2,1,3,3,6,5,7,7] 7300; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 7301; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm13 & (zmm4 ^ zmm0)) 7302; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] 7303; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,2,2,3,4,6,6,7] 7304; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 7305; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,0,2,1,4,4,6,5] 7306; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm18[0,2,2,3,4,6,6,7] 7307; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 7308; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0 7309; AVX512-NEXT: vpandq %zmm3, %zmm6, %zmm6 7310; AVX512-NEXT: vpord %zmm0, %zmm6, %zmm4 {%k1} 7311; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] 7312; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 7313; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18 7314; AVX512-NEXT: vmovdqa64 %xmm26, %xmm0 7315; AVX512-NEXT: vmovdqa64 %xmm17, %xmm2 7316; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 7317; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] 7318; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] 7319; AVX512-NEXT: vinserti32x4 $1, %xmm6, %ymm2, %ymm21 7320; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] 7321; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7322; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm2, %ymm22 7323; AVX512-NEXT: vmovdqa64 %xmm16, %xmm0 7324; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] 7325; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] 7326; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7] 7327; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm26 7328; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,5,7] 7329; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] 7330; AVX512-NEXT: vinserti32x4 $1, %xmm6, %ymm7, %ymm19 7331; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7332; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload 7333; AVX512-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 7334; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] 7335; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] 7336; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm14 7337; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] 7338; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] 7339; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm15 7340; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7341; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload 7342; AVX512-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 7343; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] 7344; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] 7345; 
AVX512-NEXT: vinserti32x4 $1, %xmm12, %ymm8, %ymm17 7346; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] 7347; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] 7348; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm8, %ymm20 7349; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 7350; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7351; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 7352; AVX512-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 7353; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7] 7354; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] 7355; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm8 7356; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7] 7357; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] 7358; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 7359; AVX512-NEXT: vmovdqa 16(%rsi), %xmm7 7360; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12 7361; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] 7362; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] 7363; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,3,2,3] 7364; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 7365; AVX512-NEXT: vpshufd {{.*#+}} xmm23 = xmm5[3,3,3,3] 7366; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero 7367; AVX512-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12 7368; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero 7369; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] 7370; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero 7371; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm23, %ymm5 7372; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12 7373; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 7374; AVX512-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] 7375; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload 7376; AVX512-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] 7377; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5 7378; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12)) 7379; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload 7380; AVX512-NEXT: # ymm12 = mem[0,0,2,1,4,4,6,5] 7381; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload 7382; AVX512-NEXT: # ymm23 = mem[0,2,2,3,4,6,6,7] 7383; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm12 7384; AVX512-NEXT: vpandnq %zmm12, %zmm3, %zmm12 7385; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload 7386; AVX512-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5] 7387; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload 7388; AVX512-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7] 7389; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 7390; AVX512-NEXT: vpandq %zmm3, %zmm23, %zmm23 7391; AVX512-NEXT: vpord %zmm12, %zmm23, %zmm5 {%k1} 7392; 
AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,3,2,3] 7393; AVX512-NEXT: vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3] 7394; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7395; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 7396; AVX512-NEXT: vmovdqa64 %xmm25, %xmm2 7397; AVX512-NEXT: vmovdqa64 %xmm27, %xmm6 7398; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 7399; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 7400; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero 7401; AVX512-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12 7402; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7403; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm24, %ymm0 7404; AVX512-NEXT: vpshufd {{.*#+}} xmm23 = xmm7[2,3,2,3] 7405; AVX512-NEXT: vpshufd {{.*#+}} xmm24 = xmm7[3,3,3,3] 7406; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero 7407; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] 7408; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 7409; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload 7410; AVX512-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] 7411; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload 7412; AVX512-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] 7413; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm12 7414; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm13 & (zmm12 ^ zmm0)) 7415; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] 7416; AVX512-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] 7417; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 7418; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 7419; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload 7420; AVX512-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5] 7421; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload 7422; AVX512-NEXT: # ymm27 = mem[0,2,2,3,4,6,6,7] 7423; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 7424; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload 7425; AVX512-NEXT: # ymm27 = mem[0,0,2,1,4,4,6,5] 7426; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload 7427; AVX512-NEXT: # ymm16 = mem[0,2,2,3,4,6,6,7] 7428; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 7429; AVX512-NEXT: vpandnq %zmm25, %zmm3, %zmm25 7430; AVX512-NEXT: vpandq %zmm3, %zmm16, %zmm16 7431; AVX512-NEXT: vpord %zmm25, %zmm16, %zmm12 {%k1} 7432; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero 7433; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero 7434; AVX512-NEXT: vinserti32x4 $1, %xmm23, %ymm16, %ymm16 7435; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero 7436; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm29, %ymm7 7437; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 7438; AVX512-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] 7439; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] 7440; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 7441; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm13 & (zmm16 ^ zmm7)) 7442; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,0,2,1,4,4,6,5] 7443; 
AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] 7444; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7 7445; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm26[0,0,2,1,4,4,6,5] 7446; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,2,2,3,4,6,6,7] 7447; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 7448; AVX512-NEXT: vpandnq %zmm7, %zmm3, %zmm7 7449; AVX512-NEXT: vpandq %zmm3, %zmm6, %zmm6 7450; AVX512-NEXT: vpord %zmm7, %zmm6, %zmm16 {%k1} 7451; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7452; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero 7453; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 7454; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 7455; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 7456; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 7457; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7] 7458; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,1,3,3,6,5,7,7] 7459; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 7460; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm13 & (zmm2 ^ zmm0)) 7461; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5] 7462; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,2,2,3,4,6,6,7] 7463; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 7464; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,0,2,1,4,4,6,5] 7465; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 7466; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 7467; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0 7468; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 7469; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm2 {%k1} 7470; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 7471; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) 7472; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rax) 7473; AVX512-NEXT: vmovdqa64 %zmm12, 128(%rax) 7474; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rax) 7475; AVX512-NEXT: vmovdqa64 %zmm4, 256(%rax) 7476; AVX512-NEXT: vmovdqa64 %zmm11, 448(%rax) 7477; AVX512-NEXT: vmovdqa64 %zmm9, 384(%rax) 7478; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rax) 7479; AVX512-NEXT: addq $680, %rsp # imm = 0x2A8 7480; AVX512-NEXT: vzeroupper 7481; AVX512-NEXT: retq 7482; 7483; AVX512-FCP-LABEL: store_i8_stride8_vf64: 7484; AVX512-FCP: # %bb.0: 7485; AVX512-FCP-NEXT: subq $392, %rsp # imm = 0x188 7486; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7487; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 7488; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm2 7489; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7490; AVX512-FCP-NEXT: vmovdqa 48(%rcx), %xmm0 7491; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm15 7492; AVX512-FCP-NEXT: vmovdqa 48(%rdx), %xmm1 7493; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] 7494; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 7495; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] 7496; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3 7497; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 7498; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 7499; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 7500; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7501; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm2 7502; AVX512-FCP-NEXT: vmovdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7503; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm14 7504; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] 7505; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 7506; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] 7507; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm3 7508; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 7509; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 7510; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 7511; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3 7512; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7513; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 7514; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7515; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 7516; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 7517; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] 7518; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3 7519; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 7520; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 7521; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm21 7522; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 7523; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 7524; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3 7525; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 7526; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 7527; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7528; AVX512-FCP-NEXT: vmovdqa 48(%r10), %xmm2 7529; AVX512-FCP-NEXT: vmovdqa 48(%rax), %xmm3 7530; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 7531; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 7532; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5 7533; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4 7534; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 7535; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7536; AVX512-FCP-NEXT: vmovdqa 48(%r9), %xmm4 7537; AVX512-FCP-NEXT: vmovdqa 48(%r8), %xmm5 7538; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 7539; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 7540; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13 7541; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7 7542; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 7543; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7544; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 7545; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7546; 
AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1 7547; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 7548; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 7549; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7550; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 7551; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7552; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm1 7553; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 7554; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 7555; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 7556; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm0 7557; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm1 7558; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 7559; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 7560; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3 7561; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 7562; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 7563; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7564; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 7565; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 7566; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3 7567; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 7568; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm20 7569; AVX512-FCP-NEXT: vmovdqa 32(%r10), %xmm2 7570; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm3 7571; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 7572; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 7573; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5 7574; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4 7575; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm22 7576; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm4 7577; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm5 7578; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 7579; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 7580; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13 7581; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7 7582; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm23 7583; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 7584; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7585; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1 7586; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 7587; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 7588; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 7589; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7590; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm1 7591; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 7592; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 7593; AVX512-FCP-NEXT: vmovdqa 
16(%rcx), %xmm0 7594; AVX512-FCP-NEXT: vmovdqa 16(%rdx), %xmm1 7595; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 7596; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 7597; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3 7598; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 7599; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm26 7600; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 7601; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 7602; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3 7603; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 7604; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm27 7605; AVX512-FCP-NEXT: vmovdqa 16(%r10), %xmm2 7606; AVX512-FCP-NEXT: vmovdqa 16(%rax), %xmm3 7607; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 7608; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 7609; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5 7610; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4 7611; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm28 7612; AVX512-FCP-NEXT: vmovdqa 16(%r9), %xmm4 7613; AVX512-FCP-NEXT: vmovdqa 16(%r8), %xmm5 7614; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 7615; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 7616; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13 7617; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7 7618; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm29 7619; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 7620; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7621; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1 7622; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 7623; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30 7624; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 7625; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7626; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm1 7627; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 7628; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm31 7629; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 7630; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7631; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm1 7632; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm17 7633; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 7634; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm18 7635; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 7636; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12 7637; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 7638; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] 7639; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] 
7640; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7641; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm2 7642; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 7643; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7644; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] 7645; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 7646; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 7647; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] 7648; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload 7649; AVX512-FCP-NEXT: # zmm3 = mem ^ (zmm7 & (zmm3 ^ mem)) 7650; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] 7651; AVX512-FCP-NEXT: vpandnq %zmm19, %zmm2, %zmm19 7652; AVX512-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm21 7653; AVX512-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA 7654; AVX512-FCP-NEXT: kmovw %eax, %k1 7655; AVX512-FCP-NEXT: vpord %zmm19, %zmm21, %zmm3 {%k1} 7656; AVX512-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload 7657; AVX512-FCP-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] 7658; AVX512-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 7659; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm6 7660; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm8 7661; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm19 7662; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm15 7663; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm8 7664; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] 7665; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm0 7666; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero 7667; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm21, %ymm0 7668; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 7669; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 7670; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 7671; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] 7672; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm8 7673; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7674; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm15, %ymm8 7675; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7676; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 7677; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm15 7678; AVX512-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload 7679; AVX512-FCP-NEXT: # xmm0 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] 7680; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7681; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm8 7682; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 7683; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm13 7684; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 7685; 
AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm8 7686; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] 7687; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm14 7688; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero 7689; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm14, %ymm21, %ymm14 7690; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 7691; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11 7692; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 7693; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] 7694; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm8 7695; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7696; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm14, %ymm8 7697; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7698; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 7699; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm14 7700; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7701; AVX512-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 7702; AVX512-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 7703; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7704; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 7705; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm8 7706; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 7707; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 7708; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm21 7709; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm8 7710; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm10 7711; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] 7712; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm1 7713; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7714; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm1, %ymm17, %ymm1 7715; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7716; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 7717; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 7718; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] 7719; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm8 7720; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 7721; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 7722; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 7723; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 7724; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 7725; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] 7726; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm9 7727; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 7728; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 7729; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero 7730; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm8, 
%ymm5 7731; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 7732; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload 7733; AVX512-FCP-NEXT: # zmm6 = mem ^ (zmm7 & (zmm6 ^ mem)) 7734; AVX512-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload 7735; AVX512-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload 7736; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm6 {%k1} 7737; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm15 # 64-byte Folded Reload 7738; AVX512-FCP-NEXT: # zmm15 = mem ^ (zmm7 & (zmm15 ^ mem)) 7739; AVX512-FCP-NEXT: vpandnq (%rsp), %zmm2, %zmm5 # 64-byte Folded Reload 7740; AVX512-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload 7741; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm15 {%k1} 7742; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm20 ^ (zmm7 & (zmm11 ^ zmm20)) 7743; AVX512-FCP-NEXT: vpandnq %zmm22, %zmm2, %zmm5 7744; AVX512-FCP-NEXT: vpandq %zmm2, %zmm23, %zmm8 7745; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm11 {%k1} 7746; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm24 ^ (zmm7 & (zmm14 ^ zmm24)) 7747; AVX512-FCP-NEXT: vpandnq %zmm25, %zmm2, %zmm5 7748; AVX512-FCP-NEXT: vpandq %zmm2, %zmm26, %zmm8 7749; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm14 {%k1} 7750; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm27 ^ (zmm7 & (zmm0 ^ zmm27)) 7751; AVX512-FCP-NEXT: vpandnq %zmm28, %zmm2, %zmm5 7752; AVX512-FCP-NEXT: vpandq %zmm2, %zmm29, %zmm8 7753; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm0 {%k1} 7754; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm30 ^ (zmm7 & (zmm1 ^ zmm30)) 7755; AVX512-FCP-NEXT: vpandnq %zmm31, %zmm2, %zmm5 7756; AVX512-FCP-NEXT: vpandq %zmm2, %zmm16, %zmm8 7757; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm1 {%k1} 7758; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm19 ^ (zmm7 & (zmm4 ^ zmm19)) 7759; AVX512-FCP-NEXT: vpandnq %zmm13, %zmm2, %zmm5 7760; AVX512-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm2 7761; AVX512-FCP-NEXT: vpord %zmm5, %zmm2, %zmm4 {%k1} 7762; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7763; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 7764; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) 7765; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) 7766; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 320(%rax) 7767; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) 7768; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 448(%rax) 7769; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) 7770; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) 7771; AVX512-FCP-NEXT: addq $392, %rsp # imm = 0x188 7772; AVX512-FCP-NEXT: vzeroupper 7773; AVX512-FCP-NEXT: retq 7774; 7775; AVX512DQ-LABEL: store_i8_stride8_vf64: 7776; AVX512DQ: # %bb.0: 7777; AVX512DQ-NEXT: subq $680, %rsp # imm = 0x2A8 7778; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 7779; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 7780; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1 7781; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7782; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm10 7783; AVX512DQ-NEXT: vmovdqa 48(%rcx), %xmm2 7784; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm0 7785; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7786; AVX512DQ-NEXT: vmovdqa 48(%rdx), %xmm3 7787; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 7788; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7789; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = 
xmm0[0,2,2,3,4,5,6,7] 7790; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 7791; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7792; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7793; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7794; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7795; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7796; AVX512DQ-NEXT: vmovdqa (%r10), %xmm1 7797; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7798; AVX512DQ-NEXT: vmovdqa 48(%r10), %xmm4 7799; AVX512DQ-NEXT: vmovdqa (%rax), %xmm0 7800; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7801; AVX512DQ-NEXT: vmovdqa 48(%rax), %xmm5 7802; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 7803; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7804; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] 7805; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1 7806; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7807; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7808; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7809; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7810; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7811; AVX512DQ-NEXT: vmovdqa (%r9), %xmm1 7812; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7813; AVX512DQ-NEXT: vmovdqa 48(%r9), %xmm7 7814; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0 7815; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7816; AVX512DQ-NEXT: vmovdqa 48(%r8), %xmm12 7817; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 7818; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] 7819; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] 7820; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 7821; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7822; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] 7823; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] 7824; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7825; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7826; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 7827; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7828; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] 7829; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 7830; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7831; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7832; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7833; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm9 7834; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 7835; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7836; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] 7837; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1 7838; 
AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7839; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7840; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7841; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7842; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7843; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] 7844; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] 7845; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,3,3,4,5,6,7] 7846; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 7847; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7848; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm1 7849; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,5,5,7] 7850; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] 7851; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0 7852; AVX512DQ-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 7853; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm0 7854; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 7855; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] 7856; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,2,2,3,4,5,6,7] 7857; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm11 7858; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm2 7859; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,4,6,5] 7860; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 7861; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm13, %ymm31 7862; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 7863; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] 7864; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] 7865; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm28 7866; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] 7867; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 7868; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm23 7869; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] 7870; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] 7871; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] 7872; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm21 7873; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] 7874; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] 7875; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm20 7876; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] 7877; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] 7878; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] 7879; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm30 7880; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] 7881; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 7882; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm29 7883; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 7884; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] 7885; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] 7886; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm24 7887; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] 7888; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 7889; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm22 7890; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm3 7891; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm5 7892; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 7893; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,1,1,3,4,5,6,7] 7894; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[2,1,3,3,4,5,6,7] 7895; AVX512DQ-NEXT: vinserti32x4 $1, %xmm13, %ymm12, %ymm19 7896; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,5,7] 7897; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] 7898; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm18 7899; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] 7900; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7] 7901; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7] 7902; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm4 7903; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7904; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,4,6,5] 7905; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 7906; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 7907; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7908; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 7909; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7910; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] 7911; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 7912; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7913; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7914; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7915; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7916; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7917; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] 7918; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] 7919; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] 7920; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 7921; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7922; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] 7923; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] 7924; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7925; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7926; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm14 7927; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm12 7928; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] 7929; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7930; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] 7931; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 7932; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7933; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7934; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7935; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7936; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7937; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm1 7938; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm2 7939; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 7940; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm26 7941; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm17 7942; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] 7943; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] 7944; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 7945; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7946; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] 7947; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7948; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 7949; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7950; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm0 7951; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm15 7952; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] 7953; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16 7954; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7] 7955; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,1,3,3,4,5,6,7] 7956; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm0 7957; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7958; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 7959; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 7960; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 7961; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm25 7962; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm27 7963; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] 7964; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] 7965; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7966; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] 7967; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm2 7968; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 7969; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 7970; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 7971; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 7972; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] 7973; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 7974; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 7975; AVX512DQ-NEXT: 
vpmovzxwq {{.*#+}} xmm10 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero 7976; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 7977; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 7978; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 7979; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 7980; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 7981; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 7982; AVX512DQ-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] 7983; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 7984; AVX512DQ-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] 7985; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm10 7986; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] 7987; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm13 & (zmm10 ^ zmm3)) 7988; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 7989; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] 7990; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 7991; AVX512DQ-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] 7992; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm4 7993; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload 7994; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] 7995; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 7996; AVX512DQ-NEXT: # ymm5 = mem[0,2,2,3,4,6,6,7] 7997; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 7998; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] 7999; AVX512DQ-NEXT: vpandnq %zmm4, %zmm3, %zmm4 8000; AVX512DQ-NEXT: vpandq %zmm3, %zmm5, %zmm5 8001; AVX512DQ-NEXT: movw $-21846, %ax # imm = 0xAAAA 8002; AVX512DQ-NEXT: kmovw %eax, %k1 8003; AVX512DQ-NEXT: vpord %zmm4, %zmm5, %zmm10 {%k1} 8004; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 8005; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero 8006; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 8007; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8008; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 8009; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 8010; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 8011; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] 8012; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7] 8013; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 8014; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm13 & (zmm9 ^ zmm0)) 8015; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 8016; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] 8017; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 8018; AVX512DQ-NEXT: # ymm2 = mem[0,2,2,3,4,6,6,7] 8019; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 8020; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 8021; AVX512DQ-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] 8022; AVX512DQ-NEXT: vpshufd $232, (%rsp), %ymm4 # 32-byte Folded 
Reload 8023; AVX512DQ-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] 8024; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 8025; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0 8026; AVX512DQ-NEXT: vpandq %zmm3, %zmm2, %zmm2 8027; AVX512DQ-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1} 8028; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 8029; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8030; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] 8031; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 8032; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 8033; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 8034; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] 8035; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 8036; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 8037; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8038; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] 8039; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] 8040; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11 8041; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm13 & (zmm11 ^ zmm0)) 8042; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,0,2,1,4,4,6,5] 8043; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] 8044; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 8045; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[0,0,2,1,4,4,6,5] 8046; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,2,2,3,4,6,6,7] 8047; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 8048; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0 8049; AVX512DQ-NEXT: vpandq %zmm3, %zmm1, %zmm1 8050; AVX512DQ-NEXT: vpord %zmm0, %zmm1, %zmm11 {%k1} 8051; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] 8052; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,7,7] 8053; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 8054; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8055; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] 8056; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] 8057; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] 8058; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm28 8059; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm5 8060; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 8061; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 8062; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] 8063; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 8064; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] 8065; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero 8066; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 8067; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8068; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 8069; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8070; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 8071; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 8072; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] 8073; AVX512DQ-NEXT: 
vpshufd {{.*#+}} ymm6 = ymm29[2,1,3,3,6,5,7,7] 8074; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 8075; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm13 & (zmm4 ^ zmm0)) 8076; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] 8077; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,2,2,3,4,6,6,7] 8078; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 8079; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,0,2,1,4,4,6,5] 8080; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm18[0,2,2,3,4,6,6,7] 8081; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 8082; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0 8083; AVX512DQ-NEXT: vpandq %zmm3, %zmm6, %zmm6 8084; AVX512DQ-NEXT: vpord %zmm0, %zmm6, %zmm4 {%k1} 8085; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] 8086; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 8087; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18 8088; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm0 8089; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm2 8090; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 8091; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] 8092; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] 8093; AVX512DQ-NEXT: vinserti32x4 $1, %xmm6, %ymm2, %ymm21 8094; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] 8095; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 8096; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm2, %ymm22 8097; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm0 8098; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] 8099; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] 8100; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7] 8101; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm26 8102; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,5,7] 8103; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] 8104; AVX512DQ-NEXT: vinserti32x4 $1, %xmm6, %ymm7, %ymm19 8105; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8106; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload 8107; AVX512DQ-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 8108; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] 8109; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] 8110; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm14 8111; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] 8112; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] 8113; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm15 8114; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8115; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload 8116; AVX512DQ-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 8117; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] 8118; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] 8119; AVX512DQ-NEXT: vinserti32x4 $1, %xmm12, %ymm8, %ymm17 8120; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] 8121; AVX512DQ-NEXT: vpshufhw 
{{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] 8122; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm8, %ymm20 8123; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 8124; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8125; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 8126; AVX512DQ-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 8127; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7] 8128; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] 8129; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm8 8130; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7] 8131; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] 8132; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 8133; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm7 8134; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm12 8135; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] 8136; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] 8137; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,3,2,3] 8138; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 8139; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm23 = xmm5[3,3,3,3] 8140; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero 8141; AVX512DQ-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12 8142; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero 8143; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] 8144; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero 8145; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm23, %ymm5 8146; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12 8147; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 8148; AVX512DQ-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] 8149; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload 8150; AVX512DQ-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] 8151; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5 8152; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12)) 8153; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload 8154; AVX512DQ-NEXT: # ymm12 = mem[0,0,2,1,4,4,6,5] 8155; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload 8156; AVX512DQ-NEXT: # ymm23 = mem[0,2,2,3,4,6,6,7] 8157; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm12 8158; AVX512DQ-NEXT: vpandnq %zmm12, %zmm3, %zmm12 8159; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload 8160; AVX512DQ-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5] 8161; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload 8162; AVX512DQ-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7] 8163; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 8164; AVX512DQ-NEXT: vpandq %zmm3, %zmm23, %zmm23 8165; AVX512DQ-NEXT: vpord %zmm12, %zmm23, %zmm5 {%k1} 8166; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,3,2,3] 8167; 
AVX512DQ-NEXT: vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3] 8168; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8169; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 8170; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm2 8171; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm6 8172; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 8173; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 8174; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero 8175; AVX512DQ-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12 8176; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8177; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm24, %ymm0 8178; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm23 = xmm7[2,3,2,3] 8179; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm24 = xmm7[3,3,3,3] 8180; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero 8181; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] 8182; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 8183; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload 8184; AVX512DQ-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] 8185; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload 8186; AVX512DQ-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] 8187; AVX512DQ-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm12 8188; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm13 & (zmm12 ^ zmm0)) 8189; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] 8190; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] 8191; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 8192; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 8193; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload 8194; AVX512DQ-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5] 8195; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload 8196; AVX512DQ-NEXT: # ymm27 = mem[0,2,2,3,4,6,6,7] 8197; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 8198; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload 8199; AVX512DQ-NEXT: # ymm27 = mem[0,0,2,1,4,4,6,5] 8200; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload 8201; AVX512DQ-NEXT: # ymm16 = mem[0,2,2,3,4,6,6,7] 8202; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 8203; AVX512DQ-NEXT: vpandnq %zmm25, %zmm3, %zmm25 8204; AVX512DQ-NEXT: vpandq %zmm3, %zmm16, %zmm16 8205; AVX512DQ-NEXT: vpord %zmm25, %zmm16, %zmm12 {%k1} 8206; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero 8207; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero 8208; AVX512DQ-NEXT: vinserti32x4 $1, %xmm23, %ymm16, %ymm16 8209; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero 8210; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm29, %ymm7 8211; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 8212; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] 8213; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] 8214; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 8215; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm13 & (zmm16 ^ zmm7)) 8216; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 
= ymm21[0,0,2,1,4,4,6,5] 8217; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] 8218; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7 8219; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm26[0,0,2,1,4,4,6,5] 8220; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,2,2,3,4,6,6,7] 8221; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 8222; AVX512DQ-NEXT: vpandnq %zmm7, %zmm3, %zmm7 8223; AVX512DQ-NEXT: vpandq %zmm3, %zmm6, %zmm6 8224; AVX512DQ-NEXT: vpord %zmm7, %zmm6, %zmm16 {%k1} 8225; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8226; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero 8227; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 8228; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 8229; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 8230; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 8231; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7] 8232; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,1,3,3,6,5,7,7] 8233; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 8234; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm13 & (zmm2 ^ zmm0)) 8235; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5] 8236; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,2,2,3,4,6,6,7] 8237; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 8238; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,0,2,1,4,4,6,5] 8239; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 8240; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 8241; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0 8242; AVX512DQ-NEXT: vpandq %zmm3, %zmm1, %zmm1 8243; AVX512DQ-NEXT: vpord %zmm0, %zmm1, %zmm2 {%k1} 8244; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 8245; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) 8246; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%rax) 8247; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%rax) 8248; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rax) 8249; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rax) 8250; AVX512DQ-NEXT: vmovdqa64 %zmm11, 448(%rax) 8251; AVX512DQ-NEXT: vmovdqa64 %zmm9, 384(%rax) 8252; AVX512DQ-NEXT: vmovdqa64 %zmm10, 64(%rax) 8253; AVX512DQ-NEXT: addq $680, %rsp # imm = 0x2A8 8254; AVX512DQ-NEXT: vzeroupper 8255; AVX512DQ-NEXT: retq 8256; 8257; AVX512DQ-FCP-LABEL: store_i8_stride8_vf64: 8258; AVX512DQ-FCP: # %bb.0: 8259; AVX512DQ-FCP-NEXT: subq $392, %rsp # imm = 0x188 8260; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8261; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 8262; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm2 8263; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8264; AVX512DQ-FCP-NEXT: vmovdqa 48(%rcx), %xmm0 8265; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm15 8266; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdx), %xmm1 8267; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] 8268; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 8269; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] 8270; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3 8271; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] 8272; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 8273; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 8274; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8275; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm2 8276; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8277; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm14 8278; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] 8279; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 8280; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] 8281; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm3 8282; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] 8283; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 8284; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 8285; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3 8286; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8287; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 8288; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8289; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 8290; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 8291; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] 8292; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3 8293; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] 8294; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 8295; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm21 8296; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 8297; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 8298; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3 8299; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 8300; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 8301; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8302; AVX512DQ-FCP-NEXT: vmovdqa 48(%r10), %xmm2 8303; AVX512DQ-FCP-NEXT: vmovdqa 48(%rax), %xmm3 8304; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 8305; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 8306; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5 8307; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4 8308; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 8309; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8310; AVX512DQ-FCP-NEXT: vmovdqa 48(%r9), %xmm4 8311; AVX512DQ-FCP-NEXT: vmovdqa 48(%r8), %xmm5 8312; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 8313; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 8314; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13 8315; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7 8316; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 8317; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8318; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 
= xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 8319; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8320; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1 8321; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 8322; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 8323; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8324; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 8325; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8326; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm1 8327; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 8328; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 8329; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 8330; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm0 8331; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm1 8332; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 8333; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 8334; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3 8335; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 8336; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 8337; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8338; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 8339; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 8340; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3 8341; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 8342; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm20 8343; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %xmm2 8344; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm3 8345; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 8346; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 8347; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5 8348; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4 8349; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm22 8350; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm4 8351; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm5 8352; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 8353; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 8354; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13 8355; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7 8356; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm23 8357; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 8358; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8359; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1 8360; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 8361; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 8362; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 8363; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8364; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm1 8365; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 8366; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 8367; AVX512DQ-FCP-NEXT: vmovdqa 16(%rcx), %xmm0 8368; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdx), %xmm1 8369; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 8370; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 8371; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3 8372; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 8373; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm26 8374; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 8375; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 8376; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3 8377; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 8378; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm27 8379; AVX512DQ-FCP-NEXT: vmovdqa 16(%r10), %xmm2 8380; AVX512DQ-FCP-NEXT: vmovdqa 16(%rax), %xmm3 8381; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 8382; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 8383; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5 8384; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4 8385; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm28 8386; AVX512DQ-FCP-NEXT: vmovdqa 16(%r9), %xmm4 8387; AVX512DQ-FCP-NEXT: vmovdqa 16(%r8), %xmm5 8388; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 8389; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 8390; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13 8391; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7 8392; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm29 8393; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 8394; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8395; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1 8396; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 8397; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30 8398; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 8399; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8400; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm1 8401; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 8402; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm31 8403; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 8404; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8405; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm1 8406; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm17 8407; AVX512DQ-FCP-NEXT: 
vpshufb %ymm12, %ymm0, %ymm0 8408; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm18 8409; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 8410; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm12 8411; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 8412; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] 8413; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] 8414; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8415; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm2 8416; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 8417; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8418; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] 8419; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 8420; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 8421; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] 8422; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload 8423; AVX512DQ-FCP-NEXT: # zmm3 = mem ^ (zmm7 & (zmm3 ^ mem)) 8424; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] 8425; AVX512DQ-FCP-NEXT: vpandnq %zmm19, %zmm2, %zmm19 8426; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm21 8427; AVX512DQ-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA 8428; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 8429; AVX512DQ-FCP-NEXT: vpord %zmm19, %zmm21, %zmm3 {%k1} 8430; AVX512DQ-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload 8431; AVX512DQ-FCP-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] 8432; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 8433; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm6 8434; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm8 8435; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm19 8436; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm15 8437; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm8 8438; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] 8439; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm0 8440; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero 8441; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm21, %ymm0 8442; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 8443; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 8444; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 8445; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] 8446; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm8 8447; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8448; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm15, %ymm8 8449; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8450; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 8451; AVX512DQ-FCP-NEXT: 
vinserti64x4 $1, %ymm0, %zmm8, %zmm15 8452; AVX512DQ-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload 8453; AVX512DQ-FCP-NEXT: # xmm0 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] 8454; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8455; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm8 8456; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 8457; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm13 8458; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 8459; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm8 8460; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] 8461; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm14 8462; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero 8463; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm14, %ymm21, %ymm14 8464; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 8465; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11 8466; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 8467; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] 8468; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm8 8469; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8470; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm14, %ymm8 8471; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8472; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 8473; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm14 8474; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8475; AVX512DQ-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 8476; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 8477; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8478; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 8479; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm8 8480; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 8481; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 8482; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm21 8483; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm8 8484; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm10 8485; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] 8486; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm1 8487; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8488; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm1, %ymm17, %ymm1 8489; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8490; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 8491; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8492; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] 8493; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm8 8494; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 8495; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 
8496; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 8497; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 8498; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 8499; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] 8500; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm9 8501; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 8502; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 8503; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero 8504; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 8505; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 8506; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload 8507; AVX512DQ-FCP-NEXT: # zmm6 = mem ^ (zmm7 & (zmm6 ^ mem)) 8508; AVX512DQ-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload 8509; AVX512DQ-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload 8510; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm6 {%k1} 8511; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm15 # 64-byte Folded Reload 8512; AVX512DQ-FCP-NEXT: # zmm15 = mem ^ (zmm7 & (zmm15 ^ mem)) 8513; AVX512DQ-FCP-NEXT: vpandnq (%rsp), %zmm2, %zmm5 # 64-byte Folded Reload 8514; AVX512DQ-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload 8515; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm15 {%k1} 8516; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm20 ^ (zmm7 & (zmm11 ^ zmm20)) 8517; AVX512DQ-FCP-NEXT: vpandnq %zmm22, %zmm2, %zmm5 8518; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm23, %zmm8 8519; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm11 {%k1} 8520; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm24 ^ (zmm7 & (zmm14 ^ zmm24)) 8521; AVX512DQ-FCP-NEXT: vpandnq %zmm25, %zmm2, %zmm5 8522; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm26, %zmm8 8523; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm14 {%k1} 8524; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm27 ^ (zmm7 & (zmm0 ^ zmm27)) 8525; AVX512DQ-FCP-NEXT: vpandnq %zmm28, %zmm2, %zmm5 8526; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm29, %zmm8 8527; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm0 {%k1} 8528; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm30 ^ (zmm7 & (zmm1 ^ zmm30)) 8529; AVX512DQ-FCP-NEXT: vpandnq %zmm31, %zmm2, %zmm5 8530; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm16, %zmm8 8531; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm1 {%k1} 8532; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm19 ^ (zmm7 & (zmm4 ^ zmm19)) 8533; AVX512DQ-FCP-NEXT: vpandnq %zmm13, %zmm2, %zmm5 8534; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm2 8535; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm2, %zmm4 {%k1} 8536; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8537; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 8538; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) 8539; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) 8540; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 320(%rax) 8541; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) 8542; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 448(%rax) 8543; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) 8544; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) 8545; AVX512DQ-FCP-NEXT: addq $392, %rsp # imm = 0x188 8546; AVX512DQ-FCP-NEXT: vzeroupper 8547; AVX512DQ-FCP-NEXT: retq 8548; 8549; AVX512BW-LABEL: store_i8_stride8_vf64: 8550; AVX512BW: # %bb.0: 8551; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 8552; AVX512BW-NEXT: movq 
{{[0-9]+}}(%rsp), %r10 8553; AVX512BW-NEXT: vmovdqa (%r10), %xmm0 8554; AVX512BW-NEXT: vmovdqa 16(%r10), %xmm12 8555; AVX512BW-NEXT: vmovdqa64 32(%r10), %xmm16 8556; AVX512BW-NEXT: vmovdqa 48(%r10), %xmm15 8557; AVX512BW-NEXT: vmovdqa (%rax), %xmm2 8558; AVX512BW-NEXT: vmovdqa 16(%rax), %xmm13 8559; AVX512BW-NEXT: vmovdqa64 32(%rax), %xmm17 8560; AVX512BW-NEXT: vmovdqa64 48(%rax), %xmm18 8561; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 8562; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] 8563; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] 8564; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 8565; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] 8566; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3] 8567; AVX512BW-NEXT: vpermw %ymm1, %ymm3, %ymm1 8568; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 8569; AVX512BW-NEXT: vmovdqa (%r9), %xmm4 8570; AVX512BW-NEXT: vmovdqa64 48(%r9), %xmm19 8571; AVX512BW-NEXT: vmovdqa (%r8), %xmm5 8572; AVX512BW-NEXT: vmovdqa64 48(%r8), %xmm21 8573; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 8574; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7] 8575; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7] 8576; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6 8577; AVX512BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7] 8578; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3] 8579; AVX512BW-NEXT: vpermw %ymm7, %ymm6, %ymm7 8580; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm14 8581; AVX512BW-NEXT: movl $-2004318072, %eax # imm = 0x88888888 8582; AVX512BW-NEXT: kmovd %eax, %k1 8583; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm14 {%k1} 8584; AVX512BW-NEXT: vmovdqa (%rsi), %xmm7 8585; AVX512BW-NEXT: vmovdqa64 48(%rsi), %xmm24 8586; AVX512BW-NEXT: vmovdqa (%rdi), %xmm8 8587; AVX512BW-NEXT: vmovdqa64 48(%rdi), %xmm27 8588; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] 8589; AVX512BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] 8590; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero 8591; AVX512BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3] 8592; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 8593; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 8594; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 8595; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] 8596; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 8597; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 8598; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 8599; AVX512BW-NEXT: vmovdqa (%rcx), %xmm9 8600; AVX512BW-NEXT: vmovdqa64 48(%rcx), %xmm28 8601; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 8602; AVX512BW-NEXT: vmovdqa64 48(%rdx), %xmm29 8603; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] 8604; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5] 8605; 
AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7] 8606; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm11, %ymm11 8607; AVX512BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm11[2,1,3,3,6,5,7,7] 8608; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7] 8609; AVX512BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 8610; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm20 8611; AVX512BW-NEXT: movl $572662306, %eax # imm = 0x22222222 8612; AVX512BW-NEXT: kmovd %eax, %k2 8613; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm1 {%k2} 8614; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA 8615; AVX512BW-NEXT: kmovd %eax, %k3 8616; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k3} 8617; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] 8618; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm14[0,1,2,3,4,4,6,5] 8619; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,6,6,7] 8620; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 8621; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] 8622; AVX512BW-NEXT: vpermw %ymm14, %ymm3, %ymm14 8623; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14 8624; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7] 8625; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,5,5,7] 8626; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,6,5,7,7] 8627; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm22, %ymm22 8628; AVX512BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm22[0,2,2,3,4,6,6,7] 8629; AVX512BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 8630; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm23 8631; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm23 {%k1} 8632; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7] 8633; AVX512BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm14[2,3,2,3] 8634; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero 8635; AVX512BW-NEXT: vpshufd {{.*#+}} xmm22 = xmm14[3,3,3,3] 8636; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero 8637; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 8638; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero 8639; AVX512BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] 8640; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero 8641; AVX512BW-NEXT: vinserti32x4 $1, %xmm14, %ymm22, %ymm14 8642; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14 8643; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] 8644; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,4,6,5] 8645; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7] 8646; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25 8647; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm22 8648; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] 8649; AVX512BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 8650; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm20 8651; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm25 8652; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm14 {%k2} 8653; 
AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm20 8654; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm14 {%k3} 8655; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] 8656; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm18 = xmm15[0,1,2,3,4,4,6,5] 8657; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm15[0,1,2,3,4,6,6,7] 8658; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm18, %ymm18 8659; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm23 8660; AVX512BW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7] 8661; AVX512BW-NEXT: vpermw %ymm15, %ymm3, %ymm15 8662; AVX512BW-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm15 8663; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] 8664; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7] 8665; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7] 8666; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm21 8667; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm19 8668; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] 8669; AVX512BW-NEXT: vpermw %ymm18, %ymm6, %ymm18 8670; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 8671; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm26 8672; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm18 {%k1} 8673; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15] 8674; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm15[2,3,2,3] 8675; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero 8676; AVX512BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm15[3,3,3,3] 8677; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero 8678; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 8679; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero 8680; AVX512BW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1] 8681; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero 8682; AVX512BW-NEXT: vinserti32x4 $1, %xmm15, %ymm24, %ymm15 8683; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm15, %zmm15 8684; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] 8685; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5] 8686; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,4,6,6,7] 8687; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24 8688; AVX512BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[2,1,3,3,6,5,7,7] 8689; AVX512BW-NEXT: vpermw %ymm21, %ymm11, %ymm21 8690; AVX512BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm21 8691; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm15 {%k2} 8692; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k3} 8693; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] 8694; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,4,6,5] 8695; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7] 8696; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 8697; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = 
ymm21[0,2,2,3,4,6,6,7] 8698; AVX512BW-NEXT: vpermw %ymm18, %ymm3, %ymm18 8699; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 8700; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] 8701; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,5,5,7] 8702; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,6,5,7,7] 8703; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24 8704; AVX512BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7] 8705; AVX512BW-NEXT: vpermw %ymm21, %ymm6, %ymm21 8706; AVX512BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm27 8707; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm27 {%k1} 8708; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7] 8709; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm18[2,3,2,3] 8710; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero 8711; AVX512BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm18[3,3,3,3] 8712; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero 8713; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 8714; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero 8715; AVX512BW-NEXT: vpshufd {{.*#+}} xmm18 = xmm18[1,1,1,1] 8716; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero 8717; AVX512BW-NEXT: vinserti32x4 $1, %xmm18, %ymm24, %ymm18 8718; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 8719; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm19[0],xmm26[1],xmm19[1],xmm26[2],xmm19[2],xmm26[3],xmm19[3],xmm26[4],xmm19[4],xmm26[5],xmm19[5],xmm26[6],xmm19[6],xmm26[7],xmm19[7] 8720; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5] 8721; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm28 = xmm21[0,1,2,3,4,6,6,7] 8722; AVX512BW-NEXT: vinserti32x4 $1, %xmm28, %ymm24, %ymm28 8723; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm24 8724; AVX512BW-NEXT: vpshufd {{.*#+}} ymm28 = ymm28[2,1,3,3,6,5,7,7] 8725; AVX512BW-NEXT: vpermw %ymm21, %ymm11, %ymm21 8726; AVX512BW-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm21 8727; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm28 8728; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm18 {%k2} 8729; AVX512BW-NEXT: vmovdqa64 16(%rsi), %xmm21 8730; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm18 {%k3} 8731; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] 8732; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5] 8733; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm16[0,1,2,3,4,6,6,7] 8734; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm17, %ymm17 8735; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm27 8736; AVX512BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] 8737; AVX512BW-NEXT: vpermw %ymm16, %ymm3, %ymm16 8738; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 8739; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] 8740; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7] 8741; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,6,5,7,7] 8742; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, 
%ymm17, %ymm25 8743; AVX512BW-NEXT: vmovdqa64 16(%rcx), %xmm17 8744; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] 8745; AVX512BW-NEXT: vpermw %ymm22, %ymm6, %ymm22 8746; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm22, %zmm25 8747; AVX512BW-NEXT: vmovdqa64 16(%rdx), %xmm22 8748; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm25 {%k1} 8749; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15] 8750; AVX512BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm16[2,3,2,3] 8751; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero 8752; AVX512BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm16[3,3,3,3] 8753; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero 8754; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 8755; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero 8756; AVX512BW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1] 8757; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero 8758; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm23, %ymm16 8759; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm16 8760; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm19[8],xmm26[9],xmm19[9],xmm26[10],xmm19[10],xmm26[11],xmm19[11],xmm26[12],xmm19[12],xmm26[13],xmm19[13],xmm26[14],xmm19[14],xmm26[15],xmm19[15] 8761; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] 8762; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7] 8763; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 8764; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] 8765; AVX512BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 8766; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 8767; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k2} 8768; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k3} 8769; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] 8770; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] 8771; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7] 8772; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 8773; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] 8774; AVX512BW-NEXT: vpermw %ymm19, %ymm3, %ymm19 8775; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 8776; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7] 8777; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7] 8778; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,6,5,7,7] 8779; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23 8780; AVX512BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7] 8781; AVX512BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 8782; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 8783; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm20 {%k1} 8784; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7] 8785; AVX512BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm19[2,3,2,3] 8786; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero 8787; 
AVX512BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm19[3,3,3,3] 8788; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero 8789; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23 8790; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero 8791; AVX512BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm19[1,1,1,1] 8792; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero 8793; AVX512BW-NEXT: vinserti32x4 $1, %xmm19, %ymm25, %ymm19 8794; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm19 8795; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm17[0],xmm22[1],xmm17[1],xmm22[2],xmm17[2],xmm22[3],xmm17[3],xmm22[4],xmm17[4],xmm22[5],xmm17[5],xmm22[6],xmm17[6],xmm22[7],xmm17[7] 8796; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm23[0,1,2,3,4,4,6,5] 8797; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm23[0,1,2,3,4,6,6,7] 8798; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 8799; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] 8800; AVX512BW-NEXT: vpermw %ymm23, %ymm11, %ymm23 8801; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm23, %zmm23 8802; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} 8803; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k3} 8804; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] 8805; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5] 8806; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7] 8807; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm13, %ymm13 8808; AVX512BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] 8809; AVX512BW-NEXT: vpermw %ymm12, %ymm3, %ymm12 8810; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 8811; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15] 8812; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7] 8813; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm13[0,1,2,3,6,5,7,7] 8814; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 8815; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] 8816; AVX512BW-NEXT: vpermw %ymm13, %ymm6, %ymm13 8817; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm13, %zmm13 8818; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1} 8819; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15] 8820; AVX512BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm12[2,3,2,3] 8821; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero 8822; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm12[3,3,3,3] 8823; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero 8824; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 8825; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 8826; AVX512BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] 8827; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 8828; AVX512BW-NEXT: vinserti32x4 $1, %xmm12, %ymm21, %ymm12 8829; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm12, %zmm12 8830; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = 
xmm22[8],xmm17[8],xmm22[9],xmm17[9],xmm22[10],xmm17[10],xmm22[11],xmm17[11],xmm22[12],xmm17[12],xmm22[13],xmm17[13],xmm22[14],xmm17[14],xmm22[15],xmm17[15] 8831; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5] 8832; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm17[0,1,2,3,4,6,6,7] 8833; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 8834; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] 8835; AVX512BW-NEXT: vpermw %ymm17, %ymm11, %ymm17 8836; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 8837; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm12 {%k2} 8838; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k3} 8839; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 8840; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] 8841; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7] 8842; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm2 8843; AVX512BW-NEXT: vpermw %ymm0, %ymm3, %ymm0 8844; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 8845; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 8846; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 8847; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] 8848; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] 8849; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 8850; AVX512BW-NEXT: vpermw %ymm2, %ymm6, %ymm2 8851; AVX512BW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 8852; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 8853; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} 8854; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] 8855; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 8856; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 8857; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] 8858; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 8859; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 8860; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8861; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 8862; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8863; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 8864; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 8865; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 8866; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] 8867; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] 8868; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 8869; AVX512BW-NEXT: vpermw %ymm3, %ymm11, %ymm3 8870; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] 8871; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 8872; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k2} 8873; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} 8874; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 8875; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) 8876; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) 8877; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rax) 8878; AVX512BW-NEXT: vmovdqa64 
%zmm16, 320(%rax) 8879; AVX512BW-NEXT: vmovdqa64 %zmm18, 256(%rax) 8880; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%rax) 8881; AVX512BW-NEXT: vmovdqa64 %zmm14, 384(%rax) 8882; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) 8883; AVX512BW-NEXT: vzeroupper 8884; AVX512BW-NEXT: retq 8885; 8886; AVX512BW-FCP-LABEL: store_i8_stride8_vf64: 8887; AVX512BW-FCP: # %bb.0: 8888; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8889; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 8890; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 8891; AVX512BW-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8892; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm20 8893; AVX512BW-FCP-NEXT: vmovdqa64 48(%rsi), %xmm17 8894; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 8895; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm21 8896; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm18 8897; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 8898; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 8899; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] 8900; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 8901; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] 8902; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm6 8903; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8904; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 8905; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 8906; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 8907; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm22 8908; AVX512BW-FCP-NEXT: vmovdqa64 48(%rcx), %xmm19 8909; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm7 8910; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm23 8911; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm24 8912; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] 8913; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] 8914; AVX512BW-FCP-NEXT: movl $572662306, %r11d # imm = 0x22222222 8915; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 8916; AVX512BW-FCP-NEXT: vpermw %zmm6, %zmm8, %zmm1 {%k1} 8917; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm6 8918; AVX512BW-FCP-NEXT: vmovdqa64 48(%r10), %xmm25 8919; AVX512BW-FCP-NEXT: vmovdqa (%rax), %xmm9 8920; AVX512BW-FCP-NEXT: vmovdqa64 48(%rax), %xmm26 8921; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] 8922; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm10 8923; AVX512BW-FCP-NEXT: vmovdqa64 48(%r9), %xmm27 8924; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm11 8925; AVX512BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm28 8926; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] 8927; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] 8928; AVX512BW-FCP-NEXT: vpermw %zmm12, %zmm13, %zmm12 8929; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] 8930; AVX512BW-FCP-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 8931; AVX512BW-FCP-NEXT: kmovd %r11d, %k2 8932; 
AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm14, %zmm12 {%k2} 8933; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] 8934; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm16 8935; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm16, %ymm16 8936; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm29 8937; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero 8938; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm29, %ymm15, %ymm15 8939; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 8940; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7] 8941; AVX512BW-FCP-NEXT: vpermw %zmm16, %zmm8, %zmm15 {%k1} 8942; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm29 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] 8943; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7] 8944; AVX512BW-FCP-NEXT: vpermw %zmm16, %zmm13, %zmm16 8945; AVX512BW-FCP-NEXT: vpermw %zmm29, %zmm14, %zmm16 {%k2} 8946; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] 8947; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm17, %xmm18 8948; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero 8949; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm29, %ymm18 8950; AVX512BW-FCP-NEXT: vmovdqa64 32(%r10), %xmm29 8951; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 8952; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm17, %ymm17 8953; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 8954; AVX512BW-FCP-NEXT: vmovdqa64 32(%rax), %xmm30 8955; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15] 8956; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm31 8957; AVX512BW-FCP-NEXT: vpermw %zmm18, %zmm8, %zmm17 {%k1} 8958; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %xmm0 8959; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] 8960; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] 8961; AVX512BW-FCP-NEXT: vpermw %zmm18, %zmm13, %zmm18 8962; AVX512BW-FCP-NEXT: vpermw %zmm19, %zmm14, %zmm18 {%k2} 8963; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] 8964; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm24 8965; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm24, %ymm24 8966; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm19, %xmm25 8967; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero 8968; AVX512BW-FCP-NEXT: vinserti32x4 
$1, %xmm25, %ymm19, %ymm19 8969; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm24, %zmm19, %zmm19 8970; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] 8971; AVX512BW-FCP-NEXT: vpermw %zmm24, %zmm8, %zmm19 {%k1} 8972; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7] 8973; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm0[0],xmm31[0],xmm0[1],xmm31[1],xmm0[2],xmm31[2],xmm0[3],xmm31[3],xmm0[4],xmm31[4],xmm0[5],xmm31[5],xmm0[6],xmm31[6],xmm0[7],xmm31[7] 8974; AVX512BW-FCP-NEXT: vpermw %zmm24, %zmm13, %zmm24 8975; AVX512BW-FCP-NEXT: vpermw %zmm25, %zmm14, %zmm24 {%k2} 8976; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] 8977; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm20, %xmm21 8978; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero 8979; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm25, %ymm21 8980; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 8981; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm20, %ymm20 8982; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm21, %zmm20 8983; AVX512BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm25 8984; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] 8985; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm23 8986; AVX512BW-FCP-NEXT: vpermw %zmm21, %zmm8, %zmm20 {%k1} 8987; AVX512BW-FCP-NEXT: vmovdqa64 16(%rcx), %xmm26 8988; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] 8989; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm31[8],xmm0[9],xmm31[9],xmm0[10],xmm31[10],xmm0[11],xmm31[11],xmm0[12],xmm31[12],xmm0[13],xmm31[13],xmm0[14],xmm31[14],xmm0[15],xmm31[15] 8990; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm13, %zmm21 8991; AVX512BW-FCP-NEXT: vpermw %zmm22, %zmm14, %zmm21 {%k2} 8992; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7] 8993; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm22 8994; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 8995; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm27, %ymm22 8996; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdx), %xmm27 8997; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8998; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 8999; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm22, %zmm22 9000; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] 9001; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm8, %zmm22 {%k1} 9002; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15] 9003; AVX512BW-FCP-NEXT: vpshufb 
%xmm5, %xmm0, %xmm23 9004; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 9005; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm23 9006; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9007; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 9008; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm23, %zmm0 9009; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] 9010; AVX512BW-FCP-NEXT: vpermw %zmm23, %zmm8, %zmm0 {%k1} 9011; AVX512BW-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 9012; AVX512BW-FCP-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] 9013; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm2, %ymm23 9014; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm23, %ymm4 9015; AVX512BW-FCP-NEXT: vmovdqa64 16(%r10), %xmm23 9016; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 9017; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 9018; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 9019; AVX512BW-FCP-NEXT: vmovdqa 16(%rax), %xmm5 9020; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 9021; AVX512BW-FCP-NEXT: vmovdqa 16(%r9), %xmm4 9022; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] 9023; AVX512BW-FCP-NEXT: vmovdqa 16(%r8), %xmm7 9024; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm8, %zmm2 {%k1} 9025; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm23[0],xmm5[1],xmm23[1],xmm5[2],xmm23[2],xmm5[3],xmm23[3],xmm5[4],xmm23[4],xmm5[5],xmm23[5],xmm5[6],xmm23[6],xmm5[7],xmm23[7] 9026; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] 9027; AVX512BW-FCP-NEXT: vpermw %zmm8, %zmm13, %zmm8 9028; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm8 {%k2} 9029; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm23[8],xmm5[9],xmm23[9],xmm5[10],xmm23[10],xmm5[11],xmm23[11],xmm5[12],xmm23[12],xmm5[13],xmm23[13],xmm5[14],xmm23[14],xmm5[15],xmm23[15] 9030; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] 9031; AVX512BW-FCP-NEXT: vpermw %zmm4, %zmm13, %zmm4 9032; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm4 {%k2} 9033; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] 9034; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] 9035; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm13, %zmm5 9036; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm5 {%k2} 9037; AVX512BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA 9038; AVX512BW-FCP-NEXT: kmovd %eax, %k1 9039; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} 9040; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} 9041; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k1} 9042; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1} 9043; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 
{%k1} 9044; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm22 {%k1} 9045; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} 9046; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} 9047; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 9048; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) 9049; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) 9050; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax) 9051; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rax) 9052; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 256(%rax) 9053; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) 9054; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rax) 9055; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) 9056; AVX512BW-FCP-NEXT: vzeroupper 9057; AVX512BW-FCP-NEXT: retq 9058; 9059; AVX512DQ-BW-LABEL: store_i8_stride8_vf64: 9060; AVX512DQ-BW: # %bb.0: 9061; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 9062; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 9063; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm0 9064; AVX512DQ-BW-NEXT: vmovdqa 16(%r10), %xmm12 9065; AVX512DQ-BW-NEXT: vmovdqa64 32(%r10), %xmm16 9066; AVX512DQ-BW-NEXT: vmovdqa 48(%r10), %xmm15 9067; AVX512DQ-BW-NEXT: vmovdqa (%rax), %xmm2 9068; AVX512DQ-BW-NEXT: vmovdqa 16(%rax), %xmm13 9069; AVX512DQ-BW-NEXT: vmovdqa64 32(%rax), %xmm17 9070; AVX512DQ-BW-NEXT: vmovdqa64 48(%rax), %xmm18 9071; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 9072; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] 9073; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] 9074; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 9075; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] 9076; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3] 9077; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm3, %ymm1 9078; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 9079; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm4 9080; AVX512DQ-BW-NEXT: vmovdqa64 48(%r9), %xmm19 9081; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm5 9082; AVX512DQ-BW-NEXT: vmovdqa64 48(%r8), %xmm21 9083; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 9084; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7] 9085; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7] 9086; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6 9087; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7] 9088; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3] 9089; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm6, %ymm7 9090; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm14 9091; AVX512DQ-BW-NEXT: movl $-2004318072, %eax # imm = 0x88888888 9092; AVX512DQ-BW-NEXT: kmovd %eax, %k1 9093; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm14 {%k1} 9094; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm7 9095; AVX512DQ-BW-NEXT: vmovdqa64 48(%rsi), %xmm24 9096; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm8 9097; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdi), %xmm27 9098; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] 9099; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] 9100; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero 9101; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm10 = 
xmm1[3,3,3,3] 9102; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero 9103; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 9104; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 9105; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] 9106; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 9107; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 9108; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 9109; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm9 9110; AVX512DQ-BW-NEXT: vmovdqa64 48(%rcx), %xmm28 9111; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm10 9112; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdx), %xmm29 9113; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] 9114; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5] 9115; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7] 9116; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm11, %ymm11 9117; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm11[2,1,3,3,6,5,7,7] 9118; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7] 9119; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 9120; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm20 9121; AVX512DQ-BW-NEXT: movl $572662306, %eax # imm = 0x22222222 9122; AVX512DQ-BW-NEXT: kmovd %eax, %k2 9123; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm1 {%k2} 9124; AVX512DQ-BW-NEXT: movw $-21846, %ax # imm = 0xAAAA 9125; AVX512DQ-BW-NEXT: kmovd %eax, %k3 9126; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k3} 9127; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] 9128; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm14[0,1,2,3,4,4,6,5] 9129; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,6,6,7] 9130; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 9131; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] 9132; AVX512DQ-BW-NEXT: vpermw %ymm14, %ymm3, %ymm14 9133; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14 9134; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7] 9135; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,5,5,7] 9136; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,6,5,7,7] 9137; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm22, %ymm22 9138; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm22[0,2,2,3,4,6,6,7] 9139; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 9140; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm23 9141; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm23 {%k1} 9142; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7] 9143; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm14[2,3,2,3] 9144; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero 9145; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm22 = xmm14[3,3,3,3] 9146; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero 9147; AVX512DQ-BW-NEXT: vinserti32x4 $1, 
%xmm22, %ymm20, %ymm20 9148; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero 9149; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] 9150; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero 9151; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm14, %ymm22, %ymm14 9152; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14 9153; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] 9154; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,4,6,5] 9155; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7] 9156; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25 9157; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm22 9158; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] 9159; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 9160; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm20 9161; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm25 9162; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm14 {%k2} 9163; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm20 9164; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm14 {%k3} 9165; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] 9166; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm18 = xmm15[0,1,2,3,4,4,6,5] 9167; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm15[0,1,2,3,4,6,6,7] 9168; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm18, %ymm18 9169; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm23 9170; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7] 9171; AVX512DQ-BW-NEXT: vpermw %ymm15, %ymm3, %ymm15 9172; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm15 9173; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] 9174; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7] 9175; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7] 9176; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm21 9177; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm19 9178; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] 9179; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm6, %ymm18 9180; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 9181; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm26 9182; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm18 {%k1} 9183; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15] 9184; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm15[2,3,2,3] 9185; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero 9186; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm15[3,3,3,3] 9187; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero 9188; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 9189; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero 9190; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1] 9191; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero 
9192; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm15, %ymm24, %ymm15 9193; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm15, %zmm15 9194; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] 9195; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5] 9196; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,4,6,6,7] 9197; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24 9198; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[2,1,3,3,6,5,7,7] 9199; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm11, %ymm21 9200; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm21 9201; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm15 {%k2} 9202; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k3} 9203; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] 9204; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,4,6,5] 9205; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7] 9206; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 9207; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] 9208; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm3, %ymm18 9209; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 9210; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] 9211; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,5,5,7] 9212; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,6,5,7,7] 9213; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24 9214; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7] 9215; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm6, %ymm21 9216; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm27 9217; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm27 {%k1} 9218; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7] 9219; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm18[2,3,2,3] 9220; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero 9221; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm18[3,3,3,3] 9222; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero 9223; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 9224; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero 9225; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm18 = xmm18[1,1,1,1] 9226; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero 9227; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm18, %ymm24, %ymm18 9228; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 9229; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm19[0],xmm26[1],xmm19[1],xmm26[2],xmm19[2],xmm26[3],xmm19[3],xmm26[4],xmm19[4],xmm26[5],xmm19[5],xmm26[6],xmm19[6],xmm26[7],xmm19[7] 9230; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5] 9231; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm28 = xmm21[0,1,2,3,4,6,6,7] 9232; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm28, %ymm24, %ymm28 9233; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm24 9234; AVX512DQ-BW-NEXT: vpshufd 
{{.*#+}} ymm28 = ymm28[2,1,3,3,6,5,7,7] 9235; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm11, %ymm21 9236; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm21 9237; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm28 9238; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm18 {%k2} 9239; AVX512DQ-BW-NEXT: vmovdqa64 16(%rsi), %xmm21 9240; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm18 {%k3} 9241; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] 9242; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5] 9243; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm16[0,1,2,3,4,6,6,7] 9244; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm17, %ymm17 9245; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm27 9246; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] 9247; AVX512DQ-BW-NEXT: vpermw %ymm16, %ymm3, %ymm16 9248; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 9249; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] 9250; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7] 9251; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,6,5,7,7] 9252; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm17, %ymm25 9253; AVX512DQ-BW-NEXT: vmovdqa64 16(%rcx), %xmm17 9254; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] 9255; AVX512DQ-BW-NEXT: vpermw %ymm22, %ymm6, %ymm22 9256; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm22, %zmm25 9257; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdx), %xmm22 9258; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm25 {%k1} 9259; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15] 9260; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm16[2,3,2,3] 9261; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero 9262; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm16[3,3,3,3] 9263; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero 9264; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 9265; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero 9266; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1] 9267; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero 9268; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm23, %ymm16 9269; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm16 9270; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm19[8],xmm26[9],xmm19[9],xmm26[10],xmm19[10],xmm26[11],xmm19[11],xmm26[12],xmm19[12],xmm26[13],xmm19[13],xmm26[14],xmm19[14],xmm26[15],xmm19[15] 9271; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] 9272; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7] 9273; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 9274; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] 9275; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 9276; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 9277; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k2} 9278; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k3} 9279; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = 
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] 9280; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] 9281; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7] 9282; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 9283; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] 9284; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm3, %ymm19 9285; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 9286; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7] 9287; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7] 9288; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,6,5,7,7] 9289; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23 9290; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7] 9291; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 9292; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 9293; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm20 {%k1} 9294; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7] 9295; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm19[2,3,2,3] 9296; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero 9297; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm19[3,3,3,3] 9298; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero 9299; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23 9300; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero 9301; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm19[1,1,1,1] 9302; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero 9303; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm19, %ymm25, %ymm19 9304; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm19 9305; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm17[0],xmm22[1],xmm17[1],xmm22[2],xmm17[2],xmm22[3],xmm17[3],xmm22[4],xmm17[4],xmm22[5],xmm17[5],xmm22[6],xmm17[6],xmm22[7],xmm17[7] 9306; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm23[0,1,2,3,4,4,6,5] 9307; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm23[0,1,2,3,4,6,6,7] 9308; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 9309; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] 9310; AVX512DQ-BW-NEXT: vpermw %ymm23, %ymm11, %ymm23 9311; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm23, %zmm23 9312; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} 9313; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k3} 9314; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] 9315; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5] 9316; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7] 9317; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm20, %ymm13, %ymm13 9318; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] 9319; AVX512DQ-BW-NEXT: vpermw %ymm12, %ymm3, %ymm12 9320; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 9321; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = 
xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15] 9322; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7] 9323; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm13[0,1,2,3,6,5,7,7] 9324; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 9325; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] 9326; AVX512DQ-BW-NEXT: vpermw %ymm13, %ymm6, %ymm13 9327; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm13, %zmm13 9328; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1} 9329; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15] 9330; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm12[2,3,2,3] 9331; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero 9332; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm12[3,3,3,3] 9333; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero 9334; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 9335; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 9336; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] 9337; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero 9338; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm12, %ymm21, %ymm12 9339; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm12, %zmm12 9340; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm22[8],xmm17[8],xmm22[9],xmm17[9],xmm22[10],xmm17[10],xmm22[11],xmm17[11],xmm22[12],xmm17[12],xmm22[13],xmm17[13],xmm22[14],xmm17[14],xmm22[15],xmm17[15] 9341; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5] 9342; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm17[0,1,2,3,4,6,6,7] 9343; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 9344; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] 9345; AVX512DQ-BW-NEXT: vpermw %ymm17, %ymm11, %ymm17 9346; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 9347; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm12 {%k2} 9348; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k3} 9349; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 9350; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] 9351; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7] 9352; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm2 9353; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm3, %ymm0 9354; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 9355; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 9356; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 9357; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] 9358; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] 9359; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 9360; AVX512DQ-BW-NEXT: vpermw %ymm2, %ymm6, %ymm2 9361; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 9362; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 9363; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} 9364; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] 9365; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 9366; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 9367; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] 9368; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 9369; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 9370; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 9371; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 9372; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 9373; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 9374; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 9375; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 9376; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] 9377; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] 9378; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 9379; AVX512DQ-BW-NEXT: vpermw %ymm3, %ymm11, %ymm3 9380; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] 9381; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 9382; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k2} 9383; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} 9384; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 9385; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) 9386; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rax) 9387; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rax) 9388; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 320(%rax) 9389; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 256(%rax) 9390; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 448(%rax) 9391; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 384(%rax) 9392; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rax) 9393; AVX512DQ-BW-NEXT: vzeroupper 9394; AVX512DQ-BW-NEXT: retq 9395; 9396; AVX512DQ-BW-FCP-LABEL: store_i8_stride8_vf64: 9397; AVX512DQ-BW-FCP: # %bb.0: 9398; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 9399; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 9400; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 9401; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9402; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm20 9403; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rsi), %xmm17 9404; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 9405; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm21 9406; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm18 9407; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 9408; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 9409; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] 9410; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 9411; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] 9412; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm6 9413; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 9414; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 9415; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 9416; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 9417; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm22 9418; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rcx), %xmm19 9419; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm7 9420; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm23 9421; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm24 9422; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] 9423; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] 9424; AVX512DQ-BW-FCP-NEXT: movl $572662306, %r11d # imm = 0x22222222 9425; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 9426; AVX512DQ-BW-FCP-NEXT: vpermw %zmm6, %zmm8, %zmm1 {%k1} 9427; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm6 9428; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r10), %xmm25 9429; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rax), %xmm9 9430; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rax), %xmm26 9431; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] 9432; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm10 9433; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r9), %xmm27 9434; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm11 9435; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm28 9436; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] 9437; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] 9438; AVX512DQ-BW-FCP-NEXT: vpermw %zmm12, %zmm13, %zmm12 9439; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] 9440; AVX512DQ-BW-FCP-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 9441; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2 9442; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm14, %zmm12 {%k2} 9443; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] 9444; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm16 9445; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm16, %ymm16 9446; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm29 9447; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero 9448; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm29, %ymm15, %ymm15 9449; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 9450; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7] 9451; AVX512DQ-BW-FCP-NEXT: vpermw %zmm16, %zmm8, %zmm15 {%k1} 9452; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm29 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] 9453; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7] 9454; AVX512DQ-BW-FCP-NEXT: vpermw %zmm16, %zmm13, %zmm16 9455; AVX512DQ-BW-FCP-NEXT: vpermw %zmm29, %zmm14, %zmm16 {%k2} 9456; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = 
xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] 9457; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm17, %xmm18 9458; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero 9459; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm29, %ymm18 9460; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r10), %xmm29 9461; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 9462; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm17, %ymm17 9463; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 9464; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rax), %xmm30 9465; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15] 9466; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm31 9467; AVX512DQ-BW-FCP-NEXT: vpermw %zmm18, %zmm8, %zmm17 {%k1} 9468; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %xmm0 9469; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] 9470; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] 9471; AVX512DQ-BW-FCP-NEXT: vpermw %zmm18, %zmm13, %zmm18 9472; AVX512DQ-BW-FCP-NEXT: vpermw %zmm19, %zmm14, %zmm18 {%k2} 9473; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] 9474; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm24 9475; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm24, %ymm24 9476; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm19, %xmm25 9477; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero 9478; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm19, %ymm19 9479; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm24, %zmm19, %zmm19 9480; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] 9481; AVX512DQ-BW-FCP-NEXT: vpermw %zmm24, %zmm8, %zmm19 {%k1} 9482; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7] 9483; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm0[0],xmm31[0],xmm0[1],xmm31[1],xmm0[2],xmm31[2],xmm0[3],xmm31[3],xmm0[4],xmm31[4],xmm0[5],xmm31[5],xmm0[6],xmm31[6],xmm0[7],xmm31[7] 9484; AVX512DQ-BW-FCP-NEXT: vpermw %zmm24, %zmm13, %zmm24 9485; AVX512DQ-BW-FCP-NEXT: vpermw %zmm25, %zmm14, %zmm24 {%k2} 9486; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] 9487; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm20, %xmm21 9488; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero 9489; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm25, %ymm21 9490; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 9491; AVX512DQ-BW-FCP-NEXT: 
vpshufb %ymm4, %ymm20, %ymm20 9492; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm21, %zmm20 9493; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm25 9494; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] 9495; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm23 9496; AVX512DQ-BW-FCP-NEXT: vpermw %zmm21, %zmm8, %zmm20 {%k1} 9497; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rcx), %xmm26 9498; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] 9499; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm31[8],xmm0[9],xmm31[9],xmm0[10],xmm31[10],xmm0[11],xmm31[11],xmm0[12],xmm31[12],xmm0[13],xmm31[13],xmm0[14],xmm31[14],xmm0[15],xmm31[15] 9500; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm13, %zmm21 9501; AVX512DQ-BW-FCP-NEXT: vpermw %zmm22, %zmm14, %zmm21 {%k2} 9502; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7] 9503; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm22 9504; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 9505; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm27, %ymm22 9506; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdx), %xmm27 9507; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9508; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 9509; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm22, %zmm22 9510; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] 9511; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm8, %zmm22 {%k1} 9512; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15] 9513; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm23 9514; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 9515; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm23 9516; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9517; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 9518; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm23, %zmm0 9519; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] 9520; AVX512DQ-BW-FCP-NEXT: vpermw %zmm23, %zmm8, %zmm0 {%k1} 9521; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 9522; AVX512DQ-BW-FCP-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] 9523; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm2, %ymm23 9524; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm23, %ymm4 9525; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%r10), %xmm23 9526; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 9527; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 9528; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm2, 
%ymm2 9529; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rax), %xmm5 9530; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 9531; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r9), %xmm4 9532; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] 9533; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r8), %xmm7 9534; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm8, %zmm2 {%k1} 9535; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm23[0],xmm5[1],xmm23[1],xmm5[2],xmm23[2],xmm5[3],xmm23[3],xmm5[4],xmm23[4],xmm5[5],xmm23[5],xmm5[6],xmm23[6],xmm5[7],xmm23[7] 9536; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] 9537; AVX512DQ-BW-FCP-NEXT: vpermw %zmm8, %zmm13, %zmm8 9538; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm8 {%k2} 9539; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm23[8],xmm5[9],xmm23[9],xmm5[10],xmm23[10],xmm5[11],xmm23[11],xmm5[12],xmm23[12],xmm5[13],xmm23[13],xmm5[14],xmm23[14],xmm5[15],xmm23[15] 9540; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] 9541; AVX512DQ-BW-FCP-NEXT: vpermw %zmm4, %zmm13, %zmm4 9542; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm4 {%k2} 9543; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] 9544; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] 9545; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm13, %zmm5 9546; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm5 {%k2} 9547; AVX512DQ-BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA 9548; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 9549; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} 9550; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} 9551; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k1} 9552; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1} 9553; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k1} 9554; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm22 {%k1} 9555; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} 9556; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} 9557; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 9558; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) 9559; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) 9560; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax) 9561; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rax) 9562; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 256(%rax) 9563; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) 9564; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rax) 9565; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) 9566; AVX512DQ-BW-FCP-NEXT: vzeroupper 9567; AVX512DQ-BW-FCP-NEXT: retq 9568 %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 9569 %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64 9570 %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64 9571 %in.vec3 = load <64 x i8>, ptr %in.vecptr3, align 64 9572 %in.vec4 = load <64 x i8>, ptr %in.vecptr4, align 64 9573 %in.vec5 = load <64 x i8>, ptr %in.vecptr5, align 64 9574 %in.vec6 = load <64 x i8>, ptr %in.vecptr6, align 64 9575 %in.vec7 = load <64 x i8>, 
ptr %in.vecptr7, align 64 9576 %1 = shufflevector <64 x i8> %in.vec0, <64 x i8> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 9577 %2 = shufflevector <64 x i8> %in.vec2, <64 x i8> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 9578 %3 = shufflevector <64 x i8> %in.vec4, <64 x i8> %in.vec5, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 9579 %4 = shufflevector <64 x i8> %in.vec6, <64 x i8> %in.vec7, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, 
i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 9580 %5 = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> 9581 %6 = shufflevector <128 x i8> %3, <128 x i8> %4, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, 
i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> 9582 %7 = shufflevector <256 x i8> %5, <256 x i8> %6, <512 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 
186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319, i32 320, i32 321, i32 322, i32 323, i32 324, i32 325, i32 326, i32 327, i32 328, i32 329, i32 330, i32 331, i32 332, i32 333, i32 334, i32 335, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 360, i32 361, i32 362, i32 363, i32 364, i32 365, i32 366, i32 367, i32 368, i32 369, i32 370, i32 371, i32 372, i32 373, i32 374, i32 375, i32 376, i32 377, i32 378, i32 379, i32 380, i32 381, i32 382, i32 383, i32 384, i32 385, i32 386, i32 387, i32 388, i32 389, i32 390, i32 391, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 415, i32 416, i32 417, i32 418, i32 419, i32 420, i32 421, i32 422, i32 423, i32 424, i32 425, i32 426, i32 427, i32 428, i32 429, i32 430, i32 431, i32 432, i32 433, i32 434, i32 435, i32 436, i32 437, i32 438, i32 439, i32 440, i32 441, i32 442, i32 443, i32 444, i32 445, i32 446, i32 447, i32 448, i32 449, i32 450, i32 451, i32 452, i32 453, i32 454, i32 455, i32 456, i32 457, i32 458, i32 459, i32 460, i32 461, i32 462, i32 463, i32 464, i32 465, i32 466, i32 467, i32 468, i32 469, i32 470, i32 471, i32 472, i32 473, i32 474, i32 475, i32 476, i32 477, i32 478, i32 479, i32 480, i32 481, i32 482, i32 483, i32 484, i32 485, i32 486, i32 487, i32 488, i32 489, i32 490, i32 491, i32 492, i32 493, i32 494, i32 495, i32 496, i32 497, i32 498, i32 499, i32 500, i32 501, i32 502, i32 503, i32 504, i32 505, i32 506, i32 507, i32 508, i32 509, i32 510, i32 511> 9583 %interleaved.vec = shufflevector <512 x i8> %7, <512 x i8> poison, <512 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 320, i32 384, i32 448, i32 1, i32 65, i32 129, i32 193, i32 257, i32 321, i32 385, i32 449, i32 2, i32 66, i32 130, i32 194, i32 258, i32 322, i32 386, i32 450, i32 3, i32 67, i32 131, i32 195, i32 259, i32 323, i32 387, i32 451, i32 4, i32 68, i32 132, i32 196, i32 260, i32 324, i32 388, i32 452, i32 5, i32 69, i32 133, i32 197, i32 261, i32 325, i32 389, i32 453, i32 6, i32 70, i32 134, i32 198, i32 262, i32 326, i32 390, i32 454, i32 7, i32 71, i32 135, i32 199, i32 263, i32 327, i32 
391, i32 455, i32 8, i32 72, i32 136, i32 200, i32 264, i32 328, i32 392, i32 456, i32 9, i32 73, i32 137, i32 201, i32 265, i32 329, i32 393, i32 457, i32 10, i32 74, i32 138, i32 202, i32 266, i32 330, i32 394, i32 458, i32 11, i32 75, i32 139, i32 203, i32 267, i32 331, i32 395, i32 459, i32 12, i32 76, i32 140, i32 204, i32 268, i32 332, i32 396, i32 460, i32 13, i32 77, i32 141, i32 205, i32 269, i32 333, i32 397, i32 461, i32 14, i32 78, i32 142, i32 206, i32 270, i32 334, i32 398, i32 462, i32 15, i32 79, i32 143, i32 207, i32 271, i32 335, i32 399, i32 463, i32 16, i32 80, i32 144, i32 208, i32 272, i32 336, i32 400, i32 464, i32 17, i32 81, i32 145, i32 209, i32 273, i32 337, i32 401, i32 465, i32 18, i32 82, i32 146, i32 210, i32 274, i32 338, i32 402, i32 466, i32 19, i32 83, i32 147, i32 211, i32 275, i32 339, i32 403, i32 467, i32 20, i32 84, i32 148, i32 212, i32 276, i32 340, i32 404, i32 468, i32 21, i32 85, i32 149, i32 213, i32 277, i32 341, i32 405, i32 469, i32 22, i32 86, i32 150, i32 214, i32 278, i32 342, i32 406, i32 470, i32 23, i32 87, i32 151, i32 215, i32 279, i32 343, i32 407, i32 471, i32 24, i32 88, i32 152, i32 216, i32 280, i32 344, i32 408, i32 472, i32 25, i32 89, i32 153, i32 217, i32 281, i32 345, i32 409, i32 473, i32 26, i32 90, i32 154, i32 218, i32 282, i32 346, i32 410, i32 474, i32 27, i32 91, i32 155, i32 219, i32 283, i32 347, i32 411, i32 475, i32 28, i32 92, i32 156, i32 220, i32 284, i32 348, i32 412, i32 476, i32 29, i32 93, i32 157, i32 221, i32 285, i32 349, i32 413, i32 477, i32 30, i32 94, i32 158, i32 222, i32 286, i32 350, i32 414, i32 478, i32 31, i32 95, i32 159, i32 223, i32 287, i32 351, i32 415, i32 479, i32 32, i32 96, i32 160, i32 224, i32 288, i32 352, i32 416, i32 480, i32 33, i32 97, i32 161, i32 225, i32 289, i32 353, i32 417, i32 481, i32 34, i32 98, i32 162, i32 226, i32 290, i32 354, i32 418, i32 482, i32 35, i32 99, i32 163, i32 227, i32 291, i32 355, i32 419, i32 483, i32 36, i32 100, i32 164, i32 228, i32 292, i32 356, i32 420, i32 484, i32 37, i32 101, i32 165, i32 229, i32 293, i32 357, i32 421, i32 485, i32 38, i32 102, i32 166, i32 230, i32 294, i32 358, i32 422, i32 486, i32 39, i32 103, i32 167, i32 231, i32 295, i32 359, i32 423, i32 487, i32 40, i32 104, i32 168, i32 232, i32 296, i32 360, i32 424, i32 488, i32 41, i32 105, i32 169, i32 233, i32 297, i32 361, i32 425, i32 489, i32 42, i32 106, i32 170, i32 234, i32 298, i32 362, i32 426, i32 490, i32 43, i32 107, i32 171, i32 235, i32 299, i32 363, i32 427, i32 491, i32 44, i32 108, i32 172, i32 236, i32 300, i32 364, i32 428, i32 492, i32 45, i32 109, i32 173, i32 237, i32 301, i32 365, i32 429, i32 493, i32 46, i32 110, i32 174, i32 238, i32 302, i32 366, i32 430, i32 494, i32 47, i32 111, i32 175, i32 239, i32 303, i32 367, i32 431, i32 495, i32 48, i32 112, i32 176, i32 240, i32 304, i32 368, i32 432, i32 496, i32 49, i32 113, i32 177, i32 241, i32 305, i32 369, i32 433, i32 497, i32 50, i32 114, i32 178, i32 242, i32 306, i32 370, i32 434, i32 498, i32 51, i32 115, i32 179, i32 243, i32 307, i32 371, i32 435, i32 499, i32 52, i32 116, i32 180, i32 244, i32 308, i32 372, i32 436, i32 500, i32 53, i32 117, i32 181, i32 245, i32 309, i32 373, i32 437, i32 501, i32 54, i32 118, i32 182, i32 246, i32 310, i32 374, i32 438, i32 502, i32 55, i32 119, i32 183, i32 247, i32 311, i32 375, i32 439, i32 503, i32 56, i32 120, i32 184, i32 248, i32 312, i32 376, i32 440, i32 504, i32 57, i32 121, i32 185, i32 249, i32 313, i32 377, i32 441, i32 505, i32 58, i32 122, i32 
186, i32 250, i32 314, i32 378, i32 442, i32 506, i32 59, i32 123, i32 187, i32 251, i32 315, i32 379, i32 443, i32 507, i32 60, i32 124, i32 188, i32 252, i32 316, i32 380, i32 444, i32 508, i32 61, i32 125, i32 189, i32 253, i32 317, i32 381, i32 445, i32 509, i32 62, i32 126, i32 190, i32 254, i32 318, i32 382, i32 446, i32 510, i32 63, i32 127, i32 191, i32 255, i32 319, i32 383, i32 447, i32 511> 9584 store <512 x i8> %interleaved.vec, ptr %out.vec, align 64 9585 ret void 9586} 9587